[v3,32/37] nptl: pthread_mutex_lock, pthread_mutex_unock single-threaded optimization

Message ID 782ad0d9a371fa66bd54df07413f3d15fba0cf5a.1615914632.git.fweimer@redhat.com
State Superseded
Delegated to: Adhemerval Zanella Netto
Headers
Series libpthread removal: NPTL forwarders are gone |

Commit Message

Florian Weimer March 16, 2021, 5:31 p.m. UTC
  This is optimization is similar in spirit to the SINGLE_THREAD_P check
in the malloc implementation.  Doing this in generic code allows us
to prioritize those cases which are likely to occur in single-threaded
programs (normal and recursive mutexes).
---
 nptl/pthread_mutex_cond_lock.c |  1 +
 nptl/pthread_mutex_lock.c      | 25 ++++++++++++++++++++++---
 nptl/pthread_mutex_unlock.c    | 17 ++++++++++++++++-
 3 files changed, 39 insertions(+), 4 deletions(-)
  

Comments

Adhemerval Zanella March 26, 2021, 6 p.m. UTC | #1
On 16/03/2021 14:31, Florian Weimer via Libc-alpha wrote:
> This is optimization is similar in spirit to the SINGLE_THREAD_P check
> in the malloc implementation.  Doing this in generic code allows us
> to prioritize those cases which are likely to occur in single-threaded
> programs (normal and recursive mutexes).

At first I though about suggesting to move it on generic lowlevellock.h,
but it is used quite only in some specific places and once libpthread is 
moved we can use libc_lock_lock (which uses pthread_mutex) instead.

The only issue with the approach of moving this optimization to 
pthread_mutex_lock (instead of custom lock as lll_lock) is to use a
slight more memory due pthread_mutex_t object, but I think the sparse
usage within glibc should be ok.

It would be good if we disentangle and consolidate both lowlevellock.h 
and libc-lock.h so we have only header and definitions.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  nptl/pthread_mutex_cond_lock.c |  1 +
>  nptl/pthread_mutex_lock.c      | 25 ++++++++++++++++++++++---
>  nptl/pthread_mutex_unlock.c    | 17 ++++++++++++++++-
>  3 files changed, 39 insertions(+), 4 deletions(-)
> 
> diff --git a/nptl/pthread_mutex_cond_lock.c b/nptl/pthread_mutex_cond_lock.c
> index 2f0771302f..3386bd689b 100644
> --- a/nptl/pthread_mutex_cond_lock.c
> +++ b/nptl/pthread_mutex_cond_lock.c
> @@ -2,6 +2,7 @@
>  
>  #define LLL_MUTEX_LOCK(mutex) \
>    lll_cond_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
> +#define LLL_MUTEX_LOCK_OPTIMIZED(mutex) LLL_MUTEX_LOCK (mutex)
>  
>  /* Not actually elided so far. Needed? */
>  #define LLL_MUTEX_LOCK_ELISION(mutex)  \
> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> index f0de7b7fd6..8649a92ffb 100644
> --- a/nptl/pthread_mutex_lock.c
> +++ b/nptl/pthread_mutex_lock.c
> @@ -30,8 +30,27 @@
>  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>     includes this file.  */
>  #ifndef LLL_MUTEX_LOCK
> -# define LLL_MUTEX_LOCK(mutex) \
> +/* lll_lock with single-thread optimization.  */
> +static inline void
> +lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> +{
> +  /* The single-threaded optimization is only valid for private
> +     mutexes.  For process-shared mutexes, the mutex could be in a
> +     shared mapping, so synchronization with another process is needed
> +     even without any threads.  If the lock is already marked as
> +     acquired, POSIX requires that pthread_mutex_lock deadlocks for
> +     normal mutexes, so skip the optimization in that case as
> +     well.  */
> +  int private = PTHREAD_MUTEX_PSHARED (mutex);
> +  if (private == LLL_PRIVATE && SINGLE_THREAD_P && mutex->__data.__lock == 0)
> +    mutex->__data.__lock = 1;
> +  else
> +    lll_lock (mutex->__data.__lock, private);
> +}
> +
> +# define LLL_MUTEX_LOCK(mutex)						\
>    lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))

Ok.

> +# define LLL_MUTEX_LOCK_OPTIMIZED(mutex) lll_mutex_lock_optimized (mutex)
>  # define LLL_MUTEX_TRYLOCK(mutex) \
>    lll_trylock ((mutex)->__data.__lock)
>  # define LLL_ROBUST_MUTEX_LOCK_MODIFIER 0
> @@ -64,7 +83,7 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
>        FORCE_ELISION (mutex, goto elision);
>      simple:
>        /* Normal mutex.  */
> -      LLL_MUTEX_LOCK (mutex);
> +      LLL_MUTEX_LOCK_OPTIMIZED (mutex);
>        assert (mutex->__data.__owner == 0);
>      }
>  #if ENABLE_ELISION_SUPPORT

Ok.

> @@ -99,7 +118,7 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
>  	}
>  
>        /* We have to get the mutex.  */
> -      LLL_MUTEX_LOCK (mutex);
> +      LLL_MUTEX_LOCK_OPTIMIZED (mutex);
>  
>        assert (mutex->__data.__owner == 0);
>        mutex->__data.__count = 1;

Ok.

> diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c
> index 3b5ccdacf9..655093ee9a 100644
> --- a/nptl/pthread_mutex_unlock.c
> +++ b/nptl/pthread_mutex_unlock.c
> @@ -28,6 +28,21 @@ static int
>  __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
>       __attribute_noinline__;
>  
> +/* lll_lock with single-thread optimization.  */
> +static inline void
> +lll_mutex_unlock_optimized (pthread_mutex_t *mutex)
> +{
> +  /* The single-threaded optimization is only valid for private
> +     mutexes.  For process-shared mutexes, the mutex could be in a
> +     shared mapping, so synchronization with another process is needed
> +     even without any threads.  */
> +  int private = PTHREAD_MUTEX_PSHARED (mutex);
> +  if (private == LLL_PRIVATE && SINGLE_THREAD_P)
> +    mutex->__data.__lock = 0;
> +  else
> +    lll_unlock (mutex->__data.__lock, private);
> +}
> +
>  int
>  attribute_hidden
>  __pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
> @@ -51,7 +66,7 @@ __pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
>  	--mutex->__data.__nusers;
>  
>        /* Unlock.  */
> -      lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));
> +      lll_mutex_unlock_optimized (mutex);
>  
>        LIBC_PROBE (mutex_release, 1, mutex);
>  
> 

Ok.
  

Patch

diff --git a/nptl/pthread_mutex_cond_lock.c b/nptl/pthread_mutex_cond_lock.c
index 2f0771302f..3386bd689b 100644
--- a/nptl/pthread_mutex_cond_lock.c
+++ b/nptl/pthread_mutex_cond_lock.c
@@ -2,6 +2,7 @@ 
 
 #define LLL_MUTEX_LOCK(mutex) \
   lll_cond_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
+#define LLL_MUTEX_LOCK_OPTIMIZED(mutex) LLL_MUTEX_LOCK (mutex)
 
 /* Not actually elided so far. Needed? */
 #define LLL_MUTEX_LOCK_ELISION(mutex)  \
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index f0de7b7fd6..8649a92ffb 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -30,8 +30,27 @@ 
 /* Some of the following definitions differ when pthread_mutex_cond_lock.c
    includes this file.  */
 #ifndef LLL_MUTEX_LOCK
-# define LLL_MUTEX_LOCK(mutex) \
+/* lll_lock with single-thread optimization.  */
+static inline void
+lll_mutex_lock_optimized (pthread_mutex_t *mutex)
+{
+  /* The single-threaded optimization is only valid for private
+     mutexes.  For process-shared mutexes, the mutex could be in a
+     shared mapping, so synchronization with another process is needed
+     even without any threads.  If the lock is already marked as
+     acquired, POSIX requires that pthread_mutex_lock deadlocks for
+     normal mutexes, so skip the optimization in that case as
+     well.  */
+  int private = PTHREAD_MUTEX_PSHARED (mutex);
+  if (private == LLL_PRIVATE && SINGLE_THREAD_P && mutex->__data.__lock == 0)
+    mutex->__data.__lock = 1;
+  else
+    lll_lock (mutex->__data.__lock, private);
+}
+
+# define LLL_MUTEX_LOCK(mutex)						\
   lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
+# define LLL_MUTEX_LOCK_OPTIMIZED(mutex) lll_mutex_lock_optimized (mutex)
 # define LLL_MUTEX_TRYLOCK(mutex) \
   lll_trylock ((mutex)->__data.__lock)
 # define LLL_ROBUST_MUTEX_LOCK_MODIFIER 0
@@ -64,7 +83,7 @@  __pthread_mutex_lock (pthread_mutex_t *mutex)
       FORCE_ELISION (mutex, goto elision);
     simple:
       /* Normal mutex.  */
-      LLL_MUTEX_LOCK (mutex);
+      LLL_MUTEX_LOCK_OPTIMIZED (mutex);
       assert (mutex->__data.__owner == 0);
     }
 #if ENABLE_ELISION_SUPPORT
@@ -99,7 +118,7 @@  __pthread_mutex_lock (pthread_mutex_t *mutex)
 	}
 
       /* We have to get the mutex.  */
-      LLL_MUTEX_LOCK (mutex);
+      LLL_MUTEX_LOCK_OPTIMIZED (mutex);
 
       assert (mutex->__data.__owner == 0);
       mutex->__data.__count = 1;
diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c
index 3b5ccdacf9..655093ee9a 100644
--- a/nptl/pthread_mutex_unlock.c
+++ b/nptl/pthread_mutex_unlock.c
@@ -28,6 +28,21 @@  static int
 __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
      __attribute_noinline__;
 
+/* lll_lock with single-thread optimization.  */
+static inline void
+lll_mutex_unlock_optimized (pthread_mutex_t *mutex)
+{
+  /* The single-threaded optimization is only valid for private
+     mutexes.  For process-shared mutexes, the mutex could be in a
+     shared mapping, so synchronization with another process is needed
+     even without any threads.  */
+  int private = PTHREAD_MUTEX_PSHARED (mutex);
+  if (private == LLL_PRIVATE && SINGLE_THREAD_P)
+    mutex->__data.__lock = 0;
+  else
+    lll_unlock (mutex->__data.__lock, private);
+}
+
 int
 attribute_hidden
 __pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
@@ -51,7 +66,7 @@  __pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
 	--mutex->__data.__nusers;
 
       /* Unlock.  */
-      lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));
+      lll_mutex_unlock_optimized (mutex);
 
       LIBC_PROBE (mutex_release, 1, mutex);