powerpc: Optimize lock elision for pthread_mutex_t

Message ID 55E5F5A3.5020105@linux.vnet.ibm.com
State Superseded
Delegated to: Adhemerval Zanella Netto
Headers

Commit Message

Paul E. Murphy Sept. 1, 2015, 6:59 p.m. UTC
  With TLE enabled, the adapt count variable update incurs
an 8% overhead before entering the critical section of an
elided mutex.

Instead, if the update is performed right after leaving the
critical section, this serialization can be avoided.

This alters the existing behavior of __lll_trylock_elision:
the adapt_count is now decremented only after the lock has
been successfully acquired and released.

2015-09-01  Paul E. Murphy  <murphyp@linux.vnet.ibm.com>
	* sysdeps/unix/sysv/linux/powerpc/elision-lock.c
	(__lll_lock_elision): Remove adapt_count decrement...
	* sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
	(__lll_trylock_elision): Likewise.
	* sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
	(__lll_unlock_elision): ... to here.
---
 sysdeps/unix/sysv/linux/powerpc/elision-lock.c    |    1 -
 sysdeps/unix/sysv/linux/powerpc/elision-trylock.c |    1 -
 sysdeps/unix/sysv/linux/powerpc/elision-unlock.c  |   10 ++++++++++
 3 files changed, 10 insertions(+), 2 deletions(-)
  

Comments

Adhemerval Zanella Netto Sept. 1, 2015, 7:09 p.m. UTC | #1
On 01-09-2015 15:59, Paul E. Murphy wrote:
> With TLE enabled, the adapt count variable update incurs
> an 8% overhead before entering the critical section of an
> elided mutex.
> 
> Instead, if it is done right after leaving the critical
> section, this serialization can be avoided.
> 
> This alters the existing behavior of __lll_trylock_elision
> as it will only decrement the adapt_count if it successfully
> acquires the lock.
> 
> 2015-09-01  Paul E. Murphy  <murphyp@linux.vnet.ibm.com>
> 	* sysdeps/unix/sysv/linux/powerpc/elision-lock.c
> 	(__lll_lock_elision): Remove adapt_count decrement...
> 	* sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
> 	(__lll_trylock_elision): Likewise.
> 	* sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> 	(__lll_unlock_elision): ... to here.
> ---
>  sysdeps/unix/sysv/linux/powerpc/elision-lock.c    |    1 -
>  sysdeps/unix/sysv/linux/powerpc/elision-trylock.c |    1 -
>  sysdeps/unix/sysv/linux/powerpc/elision-unlock.c  |   10 ++++++++++
>  3 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
> index 26d272e..3762732 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
> +++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
> @@ -47,7 +47,6 @@ __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)
>  {
>    if (*adapt_count > 0)
>      {
> -      (*adapt_count)--;
>        goto use_lock;
>      }
>  
> diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
> index 7b6d1b9..440939c 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
> +++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
> @@ -36,7 +36,6 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>    /* Only try a transaction if it's worth it.  */
>    if (*adapt_count > 0)
>      {
> -      (*adapt_count)--;
>        goto use_lock;
>      }
>  
> diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> index f04c339..88b6f5b 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> +++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> @@ -27,6 +27,16 @@ __lll_unlock_elision(int *lock, int pshared)
>    if (*lock == 0)
>      __builtin_tend (0);
>    else
> +    {
> +    /* Note assumption that the lock _is_ the first element of the structure */
> +    short *adapt_count = &((pthread_mutex_t*)(lock))->__data.__elision;
>      lll_unlock ((*lock), pshared);
> +
> +    /* Update the adapt count AFTER completing the critical section.
> +       Doing this here prevents unneeded stalling when entering
> +       a critical section.  Saving about 8% runtime on P8.  */
> +    if (*adapt_count > 0)
> +      (*adapt_count)--;
> +    }
>    return 0;
>  }
> 

I really dislike this approach of accessing internal variables,
if this is really required to improve performance best way would be
to change the __lll_unlock_elision interface to pass along the
adapt_count as the __lll_lock_elision does.

For other archs you can change the macro to just ignore it.
  

Patch

diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
index 26d272e..3762732 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
@@ -47,7 +47,6 @@  __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)
 {
   if (*adapt_count > 0)
     {
-      (*adapt_count)--;
       goto use_lock;
     }
 
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
index 7b6d1b9..440939c 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
@@ -36,7 +36,6 @@  __lll_trylock_elision (int *futex, short *adapt_count)
   /* Only try a transaction if it's worth it.  */
   if (*adapt_count > 0)
     {
-      (*adapt_count)--;
       goto use_lock;
     }
 
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
index f04c339..88b6f5b 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
@@ -27,6 +27,16 @@  __lll_unlock_elision(int *lock, int pshared)
   if (*lock == 0)
     __builtin_tend (0);
   else
+    {
+    /* Note assumption that the lock _is_ the first element of the structure */
+    short *adapt_count = &((pthread_mutex_t*)(lock))->__data.__elision;
     lll_unlock ((*lock), pshared);
+
+    /* Update the adapt count AFTER completing the critical section.
+       Doing this here prevents unneeded stalling when entering
+       a critical section.  Saving about 8% runtime on P8.  */
+    if (*adapt_count > 0)
+      (*adapt_count)--;
+    }
   return 0;
 }