[v1] nptl: Add single-threaded optimization to ADAPTIVE_NP mutex

Message ID 20230419011722.1154501-1-goldstein.w.n@gmail.com
State New
Headers
Series [v1] nptl: Add single-threaded optimization to ADAPTIVE_NP mutex |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein April 19, 2023, 1:17 a.m. UTC
  ADAPTIVED_NP mutex is generally the most performant mutex option and
is commonly used so it seems preferable for it to also benefit from
the optimization.

make check passes on linux-x86_64
---
 nptl/pthread_mutex_cond_lock.c |  1 +
 nptl/pthread_mutex_lock.c      | 40 ++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 9 deletions(-)
  

Comments

H.J. Lu April 19, 2023, 3:41 p.m. UTC | #1
On Tue, Apr 18, 2023 at 6:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> ADAPTIVED_NP mutex is generally the most performant mutex option and
> is commonly used so it seems preferable for it to also benefit from
> the optimization.

Adaptive mutex works better than normal mutex when futex syscall is the main
bottleneck.   For single threaded applications or machines with fewer cores,
adaptive mutex may not improve performance.   If it is the case, shouldn't we
focus on improving performance when futex syscall is the bottleneck?

> make check passes on linux-x86_64
> ---
>  nptl/pthread_mutex_cond_lock.c |  1 +
>  nptl/pthread_mutex_lock.c      | 40 ++++++++++++++++++++++++++--------
>  2 files changed, 32 insertions(+), 9 deletions(-)
>
> diff --git a/nptl/pthread_mutex_cond_lock.c b/nptl/pthread_mutex_cond_lock.c
> index f3af514305..fcc90bb16c 100644
> --- a/nptl/pthread_mutex_cond_lock.c
> +++ b/nptl/pthread_mutex_cond_lock.c
> @@ -3,6 +3,7 @@
>  #define LLL_MUTEX_LOCK(mutex) \
>    lll_cond_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
>  #define LLL_MUTEX_LOCK_OPTIMIZED(mutex) LLL_MUTEX_LOCK (mutex)
> +#define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) LLL_MUTEX_TRYLOCK (mutex)
>
>  /* Not actually elided so far. Needed? */
>  #define LLL_MUTEX_LOCK_ELISION(mutex)  \
> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> index d4f96c70ef..011bd7488d 100644
> --- a/nptl/pthread_mutex_lock.c
> +++ b/nptl/pthread_mutex_lock.c
> @@ -30,9 +30,9 @@
>  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>     includes this file.  */
>  #ifndef LLL_MUTEX_LOCK
> -/* lll_lock with single-thread optimization.  */
> -static inline void
> -lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> +
> +static inline int
> +lll_mutex_try_singlethreaded_opt (pthread_mutex_t *mutex, int private)
>  {
>    /* The single-threaded optimization is only valid for private
>       mutexes.  For process-shared mutexes, the mutex could be in a
> @@ -41,16 +41,38 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
>       acquired, POSIX requires that pthread_mutex_lock deadlocks for
>       normal mutexes, so skip the optimization in that case as
>       well.  */
> -  int private = PTHREAD_MUTEX_PSHARED (mutex);
>    if (private == LLL_PRIVATE && SINGLE_THREAD_P && mutex->__data.__lock == 0)
> -    mutex->__data.__lock = 1;
> -  else
> +    {
> +      mutex->__data.__lock = 1;
> +      return 0;
> +    }
> +  return 1;
> +}
> +
> +/* lll_trylock with single-thread optimization.  */
> +static inline int
> +lll_mutex_trylock_optimized (pthread_mutex_t *mutex)
> +{
> +  if (lll_mutex_try_singlethreaded_opt (mutex, PTHREAD_MUTEX_PSHARED (mutex))
> +      == 0)
> +    return 0;
> +  return lll_trylock (mutex->__data.__lock);
> +}
> +
> +/* lll_lock with single-thread optimization.  */
> +static inline void
> +lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> +{
> +  int private = PTHREAD_MUTEX_PSHARED (mutex);
> +  if (lll_mutex_try_singlethreaded_opt (mutex, private))
>      lll_lock (mutex->__data.__lock, private);
>  }
>
>  # define LLL_MUTEX_LOCK(mutex)                                         \
>    lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
>  # define LLL_MUTEX_LOCK_OPTIMIZED(mutex) lll_mutex_lock_optimized (mutex)
> +# define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) \
> +  lll_mutex_trylock_optimized (mutex)
>  # define LLL_MUTEX_TRYLOCK(mutex) \
>    lll_trylock ((mutex)->__data.__lock)
>  # define LLL_ROBUST_MUTEX_LOCK_MODIFIER 0
> @@ -133,11 +155,11 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
>    else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
>                           == PTHREAD_MUTEX_ADAPTIVE_NP, 1))
>      {
> -      if (LLL_MUTEX_TRYLOCK (mutex) != 0)
> +      if (LLL_MUTEX_TRYLOCK_OPTIMIZED (mutex) != 0)
>         {
>           int cnt = 0;
> -         int max_cnt = MIN (max_adaptive_count (),
> -                            mutex->__data.__spins * 2 + 10);
> +         int max_cnt
> +             = MIN (max_adaptive_count (), mutex->__data.__spins * 2 + 10);
>           int spin_count, exp_backoff = 1;
>           unsigned int jitter = get_jitter ();
>           do
> --
> 2.34.1
>
  
Noah Goldstein April 19, 2023, 4:57 p.m. UTC | #2
On Wed, Apr 19, 2023 at 10:42 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Apr 18, 2023 at 6:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > ADAPTIVED_NP mutex is generally the most performant mutex option and
> > is commonly used so it seems preferable for it to also benefit from
> > the optimization.
>
> Adaptive mutex works better than normal mutex when futex syscall is the main
> bottleneck.   For single threaded applications or machines with fewer cores,
> adaptive mutex may not improve performance.   If it is the case, shouldn't we
> focus on improving performance when futex syscall is the bottleneck?

I think adaptive mutex is more general than just avoiding mutexes. It seems
to be the most general purpose lock (adaptive between spin/futex based on
its usage) and is being pitched as the true default for things like std::mutex.
The single-threaded optimization seems to fit with that mentality.

>
> > make check passes on linux-x86_64
> > ---
> >  nptl/pthread_mutex_cond_lock.c |  1 +
> >  nptl/pthread_mutex_lock.c      | 40 ++++++++++++++++++++++++++--------
> >  2 files changed, 32 insertions(+), 9 deletions(-)
> >
> > diff --git a/nptl/pthread_mutex_cond_lock.c b/nptl/pthread_mutex_cond_lock.c
> > index f3af514305..fcc90bb16c 100644
> > --- a/nptl/pthread_mutex_cond_lock.c
> > +++ b/nptl/pthread_mutex_cond_lock.c
> > @@ -3,6 +3,7 @@
> >  #define LLL_MUTEX_LOCK(mutex) \
> >    lll_cond_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
> >  #define LLL_MUTEX_LOCK_OPTIMIZED(mutex) LLL_MUTEX_LOCK (mutex)
> > +#define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) LLL_MUTEX_TRYLOCK (mutex)
> >
> >  /* Not actually elided so far. Needed? */
> >  #define LLL_MUTEX_LOCK_ELISION(mutex)  \
> > diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> > index d4f96c70ef..011bd7488d 100644
> > --- a/nptl/pthread_mutex_lock.c
> > +++ b/nptl/pthread_mutex_lock.c
> > @@ -30,9 +30,9 @@
> >  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
> >     includes this file.  */
> >  #ifndef LLL_MUTEX_LOCK
> > -/* lll_lock with single-thread optimization.  */
> > -static inline void
> > -lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> > +
> > +static inline int
> > +lll_mutex_try_singlethreaded_opt (pthread_mutex_t *mutex, int private)
> >  {
> >    /* The single-threaded optimization is only valid for private
> >       mutexes.  For process-shared mutexes, the mutex could be in a
> > @@ -41,16 +41,38 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> >       acquired, POSIX requires that pthread_mutex_lock deadlocks for
> >       normal mutexes, so skip the optimization in that case as
> >       well.  */
> > -  int private = PTHREAD_MUTEX_PSHARED (mutex);
> >    if (private == LLL_PRIVATE && SINGLE_THREAD_P && mutex->__data.__lock == 0)
> > -    mutex->__data.__lock = 1;
> > -  else
> > +    {
> > +      mutex->__data.__lock = 1;
> > +      return 0;
> > +    }
> > +  return 1;
> > +}
> > +
> > +/* lll_trylock with single-thread optimization.  */
> > +static inline int
> > +lll_mutex_trylock_optimized (pthread_mutex_t *mutex)
> > +{
> > +  if (lll_mutex_try_singlethreaded_opt (mutex, PTHREAD_MUTEX_PSHARED (mutex))
> > +      == 0)
> > +    return 0;
> > +  return lll_trylock (mutex->__data.__lock);
> > +}
> > +
> > +/* lll_lock with single-thread optimization.  */
> > +static inline void
> > +lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> > +{
> > +  int private = PTHREAD_MUTEX_PSHARED (mutex);
> > +  if (lll_mutex_try_singlethreaded_opt (mutex, private))
> >      lll_lock (mutex->__data.__lock, private);
> >  }
> >
> >  # define LLL_MUTEX_LOCK(mutex)                                         \
> >    lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
> >  # define LLL_MUTEX_LOCK_OPTIMIZED(mutex) lll_mutex_lock_optimized (mutex)
> > +# define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) \
> > +  lll_mutex_trylock_optimized (mutex)
> >  # define LLL_MUTEX_TRYLOCK(mutex) \
> >    lll_trylock ((mutex)->__data.__lock)
> >  # define LLL_ROBUST_MUTEX_LOCK_MODIFIER 0
> > @@ -133,11 +155,11 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
> >    else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
> >                           == PTHREAD_MUTEX_ADAPTIVE_NP, 1))
> >      {
> > -      if (LLL_MUTEX_TRYLOCK (mutex) != 0)
> > +      if (LLL_MUTEX_TRYLOCK_OPTIMIZED (mutex) != 0)
> >         {
> >           int cnt = 0;
> > -         int max_cnt = MIN (max_adaptive_count (),
> > -                            mutex->__data.__spins * 2 + 10);
> > +         int max_cnt
> > +             = MIN (max_adaptive_count (), mutex->__data.__spins * 2 + 10);
> >           int spin_count, exp_backoff = 1;
> >           unsigned int jitter = get_jitter ();
> >           do
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
  

Patch

diff --git a/nptl/pthread_mutex_cond_lock.c b/nptl/pthread_mutex_cond_lock.c
index f3af514305..fcc90bb16c 100644
--- a/nptl/pthread_mutex_cond_lock.c
+++ b/nptl/pthread_mutex_cond_lock.c
@@ -3,6 +3,7 @@ 
 #define LLL_MUTEX_LOCK(mutex) \
   lll_cond_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
 #define LLL_MUTEX_LOCK_OPTIMIZED(mutex) LLL_MUTEX_LOCK (mutex)
+#define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) LLL_MUTEX_TRYLOCK (mutex)
 
 /* Not actually elided so far. Needed? */
 #define LLL_MUTEX_LOCK_ELISION(mutex)  \
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d4f96c70ef..011bd7488d 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -30,9 +30,9 @@ 
 /* Some of the following definitions differ when pthread_mutex_cond_lock.c
    includes this file.  */
 #ifndef LLL_MUTEX_LOCK
-/* lll_lock with single-thread optimization.  */
-static inline void
-lll_mutex_lock_optimized (pthread_mutex_t *mutex)
+
+static inline int
+lll_mutex_try_singlethreaded_opt (pthread_mutex_t *mutex, int private)
 {
   /* The single-threaded optimization is only valid for private
      mutexes.  For process-shared mutexes, the mutex could be in a
@@ -41,16 +41,38 @@  lll_mutex_lock_optimized (pthread_mutex_t *mutex)
      acquired, POSIX requires that pthread_mutex_lock deadlocks for
      normal mutexes, so skip the optimization in that case as
      well.  */
-  int private = PTHREAD_MUTEX_PSHARED (mutex);
   if (private == LLL_PRIVATE && SINGLE_THREAD_P && mutex->__data.__lock == 0)
-    mutex->__data.__lock = 1;
-  else
+    {
+      mutex->__data.__lock = 1;
+      return 0;
+    }
+  return 1;
+}
+
+/* lll_trylock with single-thread optimization.  */
+static inline int
+lll_mutex_trylock_optimized (pthread_mutex_t *mutex)
+{
+  if (lll_mutex_try_singlethreaded_opt (mutex, PTHREAD_MUTEX_PSHARED (mutex))
+      == 0)
+    return 0;
+  return lll_trylock (mutex->__data.__lock);
+}
+
+/* lll_lock with single-thread optimization.  */
+static inline void
+lll_mutex_lock_optimized (pthread_mutex_t *mutex)
+{
+  int private = PTHREAD_MUTEX_PSHARED (mutex);
+  if (lll_mutex_try_singlethreaded_opt (mutex, private))
     lll_lock (mutex->__data.__lock, private);
 }
 
 # define LLL_MUTEX_LOCK(mutex)						\
   lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
 # define LLL_MUTEX_LOCK_OPTIMIZED(mutex) lll_mutex_lock_optimized (mutex)
+# define LLL_MUTEX_TRYLOCK_OPTIMIZED(mutex) \
+  lll_mutex_trylock_optimized (mutex)
 # define LLL_MUTEX_TRYLOCK(mutex) \
   lll_trylock ((mutex)->__data.__lock)
 # define LLL_ROBUST_MUTEX_LOCK_MODIFIER 0
@@ -133,11 +155,11 @@  PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
   else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
 			  == PTHREAD_MUTEX_ADAPTIVE_NP, 1))
     {
-      if (LLL_MUTEX_TRYLOCK (mutex) != 0)
+      if (LLL_MUTEX_TRYLOCK_OPTIMIZED (mutex) != 0)
 	{
 	  int cnt = 0;
-	  int max_cnt = MIN (max_adaptive_count (),
-			     mutex->__data.__spins * 2 + 10);
+	  int max_cnt
+	      = MIN (max_adaptive_count (), mutex->__data.__spins * 2 + 10);
 	  int spin_count, exp_backoff = 1;
 	  unsigned int jitter = get_jitter ();
 	  do