[v4,1/3] Reduce CAS in low level locks [BZ #28537]

Message ID 20211110001614.2087610-2-hjl.tools@gmail.com
State Superseded
Headers
Series Optimize CAS [BZ #28537] |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

H.J. Lu Nov. 10, 2021, 12:16 a.m. UTC
  CAS instruction is expensive.  From the x86 CPU's point of view, getting
a cache line for writing is more expensive than reading.  See Appendix
A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf

The full compare and swap will grab the cache line exclusive and cause
excessive cache line bouncing.

1. Change low level locks to do an atomic load and skip CAS if compare
may fail to reduce cache line bouncing on contended locks.
2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
atomic_compare_and_exchange_val_acq and pass down the result to
__lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
there.
3. Drop __glibc_unlikely in __lll_trylock and lll_cond_trylock since we
don't know if it's actually rare; in the contended case it is clearly not
rare.
---
 nptl/lowlevellock.c         | 12 ++++++------
 sysdeps/nptl/lowlevellock.h | 29 ++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 15 deletions(-)
  

Comments

Noah Goldstein Nov. 10, 2021, 1:56 a.m. UTC | #1
On Tue, Nov 9, 2021 at 6:21 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> CAS instruction is expensive.  From the x86 CPU's point of view, getting
> a cache line for writing is more expensive than reading.  See Appendix
> A.2 Spinlock in:
>
> https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
>
> The full compare and swap will grab the cache line exclusive and cause
> excessive cache line bouncing.
>
> 1. Change low level locks to do an atomic load and skip CAS if compare
> may fail to reduce cache line bouncing on contended locks.
> 2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
> atomic_compare_and_exchange_val_acq and pass down the result to
> __lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
> there.
> 3. Drop __glibc_unlikely in __lll_trylock and lll_cond_trylock since we
> don't know if it's actually rare; in the contended case it is clearly not
> rare.

Think especially since you have a manual check for old vs new, the atomic
CAS failure should still be wrapped in a `glibc_unlikely`.

> ---
>  nptl/lowlevellock.c         | 12 ++++++------
>  sysdeps/nptl/lowlevellock.h | 29 ++++++++++++++++++++---------
>  2 files changed, 26 insertions(+), 15 deletions(-)
>
> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
> index 8c740529c4..d1965c01ca 100644
> --- a/nptl/lowlevellock.c
> +++ b/nptl/lowlevellock.c
> @@ -22,30 +22,30 @@
>  #include <stap-probe.h>
>
>  void
> -__lll_lock_wait_private (int *futex)
> +__lll_lock_wait_private (int *futex, int futex_value)
>  {
> -  if (atomic_load_relaxed (futex) == 2)
> +  if (futex_value == 2)
>      goto futex;
>
>    while (atomic_exchange_acquire (futex, 2) != 0)
>      {
>      futex:
> -      LIBC_PROBE (lll_lock_wait_private, 1, futex);
> +      LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
>        futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>      }
>  }
>  libc_hidden_def (__lll_lock_wait_private)
>
>  void
> -__lll_lock_wait (int *futex, int private)
> +__lll_lock_wait (int *futex, int futex_value, int private)
>  {
> -  if (atomic_load_relaxed (futex) == 2)
> +  if (futex_value == 2)
>      goto futex;
>
>    while (atomic_exchange_acquire (futex, 2) != 0)
>      {
>      futex:
> -      LIBC_PROBE (lll_lock_wait, 1, futex);
> +      LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
>        futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2.  */
>      }
>  }
> diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
> index 4d95114ed3..4235d13de9 100644
> --- a/sysdeps/nptl/lowlevellock.h
> +++ b/sysdeps/nptl/lowlevellock.h
> @@ -66,7 +66,11 @@
>     0.  Otherwise leave lock unchanged and return non-zero to indicate that the
>     lock was not acquired.  */
>  #define __lll_trylock(lock)    \
> -  __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
> +  (__extension__ ({                                                    \
> +    __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock);      \
> +    (__lock_value != 0                                                 \
> +     || atomic_compare_and_exchange_bool_acq ((lock), 1, 0));          \
> +  }))
>  #define lll_trylock(lock)      \
>     __lll_trylock (&(lock))
>
> @@ -74,11 +78,15 @@
>     return 0.  Otherwise leave lock unchanged and return non-zero to indicate
>     that the lock was not acquired.  */
>  #define lll_cond_trylock(lock) \
> -  __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
> +  (__extension__ ({                                                    \
> +    __typeof (lock) __lock_value = atomic_load_relaxed (&(lock));      \
> +    (__lock_value != 0                                                 \
> +     || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0));         \
> +  }))
>
> -extern void __lll_lock_wait_private (int *futex);
> +extern void __lll_lock_wait_private (int *futex, int futex_value);
>  libc_hidden_proto (__lll_lock_wait_private)
> -extern void __lll_lock_wait (int *futex, int private);
> +extern void __lll_lock_wait (int *futex, int futex_value, int private);
>  libc_hidden_proto (__lll_lock_wait)
>
>  /* This is an expression rather than a statement even though its value is
> @@ -95,13 +103,15 @@ libc_hidden_proto (__lll_lock_wait)
>    ((void)                                                               \
>     ({                                                                   \
>       int *__futex = (futex);                                            \
> -     if (__glibc_unlikely                                               \
> -         (atomic_compare_and_exchange_bool_acq (__futex, 1, 0)))        \
> +     int __futex_value = atomic_load_relaxed (futex);                   \
> +     if (__futex_value != 0                                             \
> +         || ((__futex_value = atomic_compare_and_exchange_val_acq       \
> +              (__futex, 1, 0) != 0)))                                   \
>         {                                                                \
>           if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
> -           __lll_lock_wait_private (__futex);                           \
> +           __lll_lock_wait_private (futex, __futex_value);              \
>           else                                                           \
> -           __lll_lock_wait (__futex, private);                          \
> +           __lll_lock_wait (futex, __futex_value, private);             \
>         }                                                                \
>     }))
>  #define lll_lock(futex, private)       \
> @@ -120,7 +130,8 @@ libc_hidden_proto (__lll_lock_wait)
>     ({                                                                   \
>       int *__futex = (futex);                                            \
>       if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0))      \
> -       __lll_lock_wait (__futex, private);                              \
> +       __lll_lock_wait (__futex, atomic_load_relaxed (__futex),         \
> +                       private); \
>     }))
>  #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)
>
> --
> 2.33.1
>
  

Patch

diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
index 8c740529c4..d1965c01ca 100644
--- a/nptl/lowlevellock.c
+++ b/nptl/lowlevellock.c
@@ -22,30 +22,30 @@ 
 #include <stap-probe.h>
 
 void
-__lll_lock_wait_private (int *futex)
+__lll_lock_wait_private (int *futex, int futex_value)
 {
-  if (atomic_load_relaxed (futex) == 2)
+  if (futex_value == 2)
     goto futex;
 
   while (atomic_exchange_acquire (futex, 2) != 0)
     {
     futex:
-      LIBC_PROBE (lll_lock_wait_private, 1, futex);
+      LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
       futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
     }
 }
 libc_hidden_def (__lll_lock_wait_private)
 
 void
-__lll_lock_wait (int *futex, int private)
+__lll_lock_wait (int *futex, int futex_value, int private)
 {
-  if (atomic_load_relaxed (futex) == 2)
+  if (futex_value == 2)
     goto futex;
 
   while (atomic_exchange_acquire (futex, 2) != 0)
     {
     futex:
-      LIBC_PROBE (lll_lock_wait, 1, futex);
+      LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
       futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2.  */
     }
 }
diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
index 4d95114ed3..4235d13de9 100644
--- a/sysdeps/nptl/lowlevellock.h
+++ b/sysdeps/nptl/lowlevellock.h
@@ -66,7 +66,11 @@ 
    0.  Otherwise leave lock unchanged and return non-zero to indicate that the
    lock was not acquired.  */
 #define __lll_trylock(lock)	\
-  __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
+  (__extension__ ({							\
+    __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock);	\
+    (__lock_value != 0							\
+     || atomic_compare_and_exchange_bool_acq ((lock), 1, 0));		\
+  }))
 #define lll_trylock(lock)	\
    __lll_trylock (&(lock))
 
@@ -74,11 +78,15 @@ 
    return 0.  Otherwise leave lock unchanged and return non-zero to indicate
    that the lock was not acquired.  */
 #define lll_cond_trylock(lock)	\
-  __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
+  (__extension__ ({							\
+    __typeof (lock) __lock_value = atomic_load_relaxed (&(lock));	\
+    (__lock_value != 0							\
+     || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0));		\
+  }))
 
-extern void __lll_lock_wait_private (int *futex);
+extern void __lll_lock_wait_private (int *futex, int futex_value);
 libc_hidden_proto (__lll_lock_wait_private)
-extern void __lll_lock_wait (int *futex, int private);
+extern void __lll_lock_wait (int *futex, int futex_value, int private);
 libc_hidden_proto (__lll_lock_wait)
 
 /* This is an expression rather than a statement even though its value is
@@ -95,13 +103,15 @@  libc_hidden_proto (__lll_lock_wait)
   ((void)                                                               \
    ({                                                                   \
      int *__futex = (futex);                                            \
-     if (__glibc_unlikely                                               \
-         (atomic_compare_and_exchange_bool_acq (__futex, 1, 0)))        \
+     int __futex_value = atomic_load_relaxed (futex);                   \
+     if (__futex_value != 0                                             \
+         || ((__futex_value = atomic_compare_and_exchange_val_acq       \
+              (__futex, 1, 0) != 0)))                                   \
        {                                                                \
          if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
-           __lll_lock_wait_private (__futex);                           \
+           __lll_lock_wait_private (futex, __futex_value);              \
          else                                                           \
-           __lll_lock_wait (__futex, private);                          \
+           __lll_lock_wait (futex, __futex_value, private);             \
        }                                                                \
    }))
 #define lll_lock(futex, private)	\
@@ -120,7 +130,8 @@  libc_hidden_proto (__lll_lock_wait)
    ({                                                                   \
      int *__futex = (futex);                                            \
      if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0))      \
-       __lll_lock_wait (__futex, private);                              \
+       __lll_lock_wait (__futex, atomic_load_relaxed (__futex),         \
+			private); \
    }))
 #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)