[2/2] PowerPC: libc single-thread lock optimization

Message ID 5361013D.9020005@linux.vnet.ibm.com
State Rejected

Commit Message

Adhemerval Zanella Netto April 30, 2014, 1:57 p.m. UTC
  This patch adds a single-thread optimization for locks used within
libc.so.  For each lock operation it checks whether the process has already
spawned any threads and, if not, uses non-atomic operations.  Other libraries
(libpthread.so for instance) are unaffected by this change.
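
In outline, the fast path that each lll_* macro gains looks like the
following sketch (for illustration only, not code from the patch;
THREAD_GETMEM, THREAD_SELF and header.multiple_threads are the NPTL TCB
accessors, and atomic_compare_and_exchange_val_acq is glibc's internal
CAS macro):

    static inline int
    lll_lock_fast_path (int *futex)
    {
      if (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
        {
          /* No second thread can exist yet, so a plain load and store
             suffice and the LL/SC sequence is skipped entirely.  */
          int old = *futex;
          if (old == 0)
            *futex = 1;
          return old;
        }
      /* Multi-threaded: fall back to the atomic compare-and-swap.  */
      return atomic_compare_and_exchange_val_acq (futex, 1, 0);
    }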

This is a respin of my first patch to add such an optimization, but now the
code is focused only on lowlevellock.h; atomic.h is untouched.

Tested on powerpc32 and powerpc64.

--

	* nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
	(__lll_is_single_thread): New macro to check if process has spawned
	any threads.
	(__lll_robust_trylock): Add optimization to avoid atomic operations in
	single thread case.
	(lll_lock): Likewise.
	(lll_robust_lock): Likewise.
	(lll_cond_lock): Likewise.
	(lll_robust_cond_lock): Likewise.
	(lll_timedlock): Likewise.
	(lll_robust_timedlock): Likewise.
	(lll_unlock): Likewise.
	(lll_robust_unlock): Likewise.

---
  

Comments

Will Newton May 1, 2014, 9:11 a.m. UTC | #1
On 30 April 2014 14:57, Adhemerval Zanella <azanella@linux.vnet.ibm.com> wrote:
> This patch adds a single-thread optimization for locks used within
> libc.so.  For each lock operation it checks whether the process has already
> spawned any threads and, if not, uses non-atomic operations.  Other libraries
> (libpthread.so for instance) are unaffected by this change.
>
> This is a respin of my first patch to add such an optimization, but now the
> code is focused only on lowlevellock.h; atomic.h is untouched.
>
> Tested on powerpc32 and powerpc64.
>
> --
>
>         * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>         (__lll_is_single_thread): New macro to check if process has spawned
>         any threads.
>         (__lll_robust_trylock): Add optimization to avoid atomic operations in
>         single thread case.
>         (lll_lock): Likewise.
>         (lll_robust_lock): Likewise.
>         (lll_cond_lock): Likewise.
>         (lll_robust_cond_lock): Likewise.
>         (lll_timedlock): Likewise.
>         (lll_robust_timedlock): Likewise.
>         (lll_unlock): Likewise.
>         (lll_robust_unlock): Likewise.
>
> ---
>
> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> index ab92c3f..38529a4 100644
> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> @@ -76,6 +76,16 @@
>  # endif
>  #endif
>
> +/* For internal libc.so lock calls in single-thread process we use normal
> +   load/stores.  */
> +#if !defined NOT_IN_libc || defined UP

Where does this UP define come from? It seems to date back to the
early days of NPTL but I don't know when it gets set.

> +# define __lll_is_single_thread                                                      \
> +  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
> +#else
> +# define __lll_is_single_thread (0)
> +#endif
> +
> +
>  #define lll_futex_wait(futexp, val, private) \
>    lll_futex_timed_wait (futexp, val, NULL, private)
>
> @@ -205,7 +215,9 @@
>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
>  #define __lll_robust_trylock(futex, id) \
>    ({ int __val;                                                                      \
> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
> +     if (!__lll_is_single_thread)                                            \
> +       __asm __volatile (                                                    \
> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>                        "        cmpwi   0,%0,0\n"                             \
>                        "        bne     2f\n"                                 \
>                        "        stwcx.  %3,0,%2\n"                            \
> @@ -214,6 +226,12 @@
>                        : "=&r" (__val), "=m" (*futex)                         \
>                        : "r" (futex), "r" (id), "m" (*futex)                  \
>                        : "cr0", "memory");                                    \
> +     else                                                                    \
> +       {                                                                     \
> +        __val = *futex;                                                      \
> +        if (__val == 0)                                                      \
> +          *futex = id;                                                       \
> +       }                                                                     \
>       __val;                                                                  \
>    })
>
> @@ -237,8 +255,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>  #define lll_lock(lock, private) \
>    (void) ({                                                                  \
>      int *__futex = &(lock);                                                  \
> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
> -                         0) != 0)                                            \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = *__futex;                                                     \
> +       if (__tmp == 0)                                                       \
> +         *__futex = 1;                                                       \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0) != 0)                                    \

Should this be __glibc_unlikely? e.g.:

__glibc_unlikely(__tmp != 0)
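
(For context, a note not in the original mail: __glibc_likely and
__glibc_unlikely are thin wrappers around GCC's __builtin_expect, defined
in glibc's misc/sys/cdefs.h roughly as

    # define __glibc_unlikely(cond)  __builtin_expect ((cond), 0)
    # define __glibc_likely(cond)    __builtin_expect ((cond), 1)

so __glibc_unlikely (__tmp != 0) emits the same branch hint as the
__builtin_expect form; the suggestion is purely about readability.)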

>        {                                                                              \
>         if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)       \
>           __lll_lock_wait_private (__futex);                                  \
> @@ -251,8 +277,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>    ({                                                                         \
>      int *__futex = &(lock);                                                  \
>      int __val = 0;                                                           \
> -    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, id,  \
> -                                                               0), 0))       \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_bool_acq (__futex, id, 0);         \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = (*__futex == 0);                                              \
> +       if (__tmp)                                                            \
> +         *__futex = id;                                                      \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0))                                         \

Likewise (and more places below).

>        __val = __lll_robust_lock_wait (__futex, private);                     \
>      __val;                                                                   \
>    })
> @@ -260,8 +294,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>  #define lll_cond_lock(lock, private) \
>    (void) ({                                                                  \
>      int *__futex = &(lock);                                                  \
> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 2, 0),\
> -                         0) != 0)                                            \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 2, 0);           \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = *__futex;                                                     \
> +       if (__tmp == 0)                                                       \
> +         *__futex = 2;                                                       \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
>        __lll_lock_wait (__futex, private);                                    \
>    })
>
> @@ -269,9 +311,17 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>    ({                                                                         \
>      int *__futex = &(lock);                                                  \
>      int __val = 0;                                                           \
> +    int __tmp;                                                               \
>      int __id = id | FUTEX_WAITERS;                                           \
> -    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, __id,\
> -                                                               0), 0))       \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_bool_acq (__futex, __id, 0);       \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = (*__futex == 0);                                              \
> +       if (__tmp)                                                            \
> +         *__futex = id;                                                      \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0))                                         \
>        __val = __lll_robust_lock_wait (__futex, private);                     \
>      __val;                                                                   \
>    })
> @@ -286,8 +336,16 @@ extern int __lll_robust_timedlock_wait
>    ({                                                                         \
>      int *__futex = &(lock);                                                  \
>      int __val = 0;                                                           \
> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
> -                         0) != 0)                                            \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = *__futex;                                                     \
> +       if (__tmp == 0)                                                       \
> +         *__futex = 1;                                                       \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
>        __val = __lll_timedlock_wait (__futex, abstime, private);                      \
>      __val;                                                                   \
>    })
> @@ -296,8 +354,16 @@ extern int __lll_robust_timedlock_wait
>    ({                                                                         \
>      int *__futex = &(lock);                                                  \
>      int __val = 0;                                                           \
> -    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, id,  \
> -                                                               0), 0))       \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_bool_acq (__futex, id, 0);         \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = (*__futex == 0);                                              \
> +       if (__tmp)                                                            \
> +         *__futex = id;                                                      \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0))                                         \
>        __val = __lll_robust_timedlock_wait (__futex, abstime, private);       \
>      __val;                                                                   \
>    })
> @@ -305,7 +371,14 @@ extern int __lll_robust_timedlock_wait
>  #define lll_unlock(lock, private) \
>    ((void) ({                                                                 \
>      int *__futex = &(lock);                                                  \
> -    int __val = atomic_exchange_rel (__futex, 0);                            \
> +    int __val;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __val = atomic_exchange_rel (__futex, 0);                                      \
> +    else                                                                     \
> +      {                                                                              \
> +       __val = *__futex;                                                     \
> +       *__futex = 0;                                                         \
> +      }                                                                              \
>      if (__glibc_unlikely (__val > 1))                                        \
>        lll_futex_wake (__futex, 1, private);                                  \
>    }))
> @@ -313,7 +386,14 @@ extern int __lll_robust_timedlock_wait
>  #define lll_robust_unlock(lock, private) \
>    ((void) ({                                                                 \
>      int *__futex = &(lock);                                                  \
> -    int __val = atomic_exchange_rel (__futex, 0);                            \
> +    int __val;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __val = atomic_exchange_rel (__futex, 0);                                      \
> +    else                                                                     \
> +      {                                                                              \
> +       __val = *__futex;                                                     \
> +       *__futex = 0;                                                         \
> +      }                                                                              \
>      if (__glibc_unlikely (__val & FUTEX_WAITERS))                            \
>        lll_futex_wake (__futex, 1, private);                                  \
>    }))
> --
> 1.8.4.2
>
  
Adhemerval Zanella Netto May 1, 2014, 4:17 p.m. UTC | #2
Thanks for the review, Will.


On 01-05-2014 06:11, Will Newton wrote:
> On 30 April 2014 14:57, Adhemerval Zanella <azanella@linux.vnet.ibm.com> wrote:
>> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> index ab92c3f..38529a4 100644
>> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> @@ -76,6 +76,16 @@
>>  # endif
>>  #endif
>>
>> +/* For internal libc.so lock calls in single-thread process we use normal
>> +   load/stores.  */
>> +#if !defined NOT_IN_libc || defined UP
> Where does this UP define come from? It seems to date back to the
> early days of NPTL but I don't know when it gets set.

In fact I didn't dig into it too much; I added it just to stay closer to the
x86 implementation.  I think it is safe to simply drop its use.


>> +# define __lll_is_single_thread                                                      \
>> +  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
>> +#else
>> +# define __lll_is_single_thread (0)
>> +#endif
>> +
>> +
>>  #define lll_futex_wait(futexp, val, private) \
>>    lll_futex_timed_wait (futexp, val, NULL, private)
>>
>> @@ -205,7 +215,9 @@
>>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
>>  #define __lll_robust_trylock(futex, id) \
>>    ({ int __val;                                                                      \
>> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>> +     if (!__lll_is_single_thread)                                            \
>> +       __asm __volatile (                                                    \
>> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>>                        "        cmpwi   0,%0,0\n"                             \
>>                        "        bne     2f\n"                                 \
>>                        "        stwcx.  %3,0,%2\n"                            \
>> @@ -214,6 +226,12 @@
>>                        : "=&r" (__val), "=m" (*futex)                         \
>>                        : "r" (futex), "r" (id), "m" (*futex)                  \
>>                        : "cr0", "memory");                                    \
>> +     else                                                                    \
>> +       {                                                                     \
>> +        __val = *futex;                                                      \
>> +        if (__val == 0)                                                      \
>> +          *futex = id;                                                       \
>> +       }                                                                     \
>>       __val;                                                                  \
>>    })
>>
>> @@ -237,8 +255,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>>  #define lll_lock(lock, private) \
>>    (void) ({                                                                  \
>>      int *__futex = &(lock);                                                  \
>> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
>> -                         0) != 0)                                            \
>> +    int __tmp;                                                               \
>> +    if (!__lll_is_single_thread)                                             \
>> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
>> +    else                                                                     \
>> +      {                                                                              \
>> +       __tmp = *__futex;                                                     \
>> +       if (__tmp == 0)                                                       \
>> +         *__futex = 1;                                                       \
>> +      }                                                                              \
>> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
> Should this be __glibc_unlikely? e.g.:
>
> __glibc_unlikely(__tmp != 0)

I will fix it, thanks.
  
Torvald Riegel May 2, 2014, 3:42 p.m. UTC | #3
On Wed, 2014-04-30 at 10:57 -0300, Adhemerval Zanella wrote:
> This patch adds a single-thread optimization for locks used within
> libc.so.  For each lock operation it checks whether the process has already
> spawned any threads and, if not, uses non-atomic operations.  Other libraries
> (libpthread.so for instance) are unaffected by this change.
> 
> This is a respin of my first patch to add such an optimization, but now the
> code is focused only on lowlevellock.h; atomic.h is untouched.
> 
> Tested on powerpc32 and powerpc64.
> 
> --
> 
>         * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>         (__lll_is_single_thread): New macro to check if process has spawned
>         any threads.
>         (__lll_robust_trylock): Add optimization to avoid atomic operations in
>         single thread case.
>         (lll_lock): Likewise.
>         (lll_robust_lock): Likewise.
>         (lll_cond_lock): Likewise.
>         (lll_robust_cond_lock): Likewise.
>         (lll_timedlock): Likewise.
>         (lll_robust_timedlock): Likewise.
>         (lll_unlock): Likewise.
>         (lll_robust_unlock): Likewise.
> 
> ---
> 
> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> index ab92c3f..38529a4 100644
> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
> @@ -76,6 +76,16 @@
>  # endif
>  #endif
>  
> +/* For internal libc.so lock calls in single-thread process we use normal
> +   load/stores.  */
> +#if !defined NOT_IN_libc || defined UP
> +# define __lll_is_single_thread                                                      \
> +  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)

I disagree that single-threaded execution should be the likely case.
There is a large body of existing single-threaded code.  But what are the
microbenchmarks or data that we base this decision on?

> +#else
> +# define __lll_is_single_thread (0)
> +#endif
> +
> +
>  #define lll_futex_wait(futexp, val, private) \
>    lll_futex_timed_wait (futexp, val, NULL, private)
>  
> @@ -205,7 +215,9 @@
>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
>  #define __lll_robust_trylock(futex, id) \
>    ({ int __val;                                                                      \
> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
> +     if (!__lll_is_single_thread)                                            \
> +       __asm __volatile (                                                    \
> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>                        "        cmpwi   0,%0,0\n"                             \
>                        "        bne     2f\n"                                 \
>                        "        stwcx.  %3,0,%2\n"                            \
> @@ -214,6 +226,12 @@
>                        : "=&r" (__val), "=m" (*futex)                         \
>                        : "r" (futex), "r" (id), "m" (*futex)                  \
>                        : "cr0", "memory");                                    \
> +     else                                                                    \
> +       {                                                                     \
> +        __val = *futex;                                                      \
> +        if (__val == 0)                                                      \
> +          *futex = id;                                                       \
> +       }                                                                     \
>       __val;                                                                  \
>    })

Conceptually, you can safely use trylock in signal handlers, because
it's not a blocking operation.  And this is used for normal and robust
locks.  I haven't checked all the trylock code and all uses to see
whether it is indeed AS-Safe, but I'm pretty sure this change is wrong.
At the very least, it doesn't document that trylock is now AS-Unsafe.
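
(An editorial illustration of one way the hazard can materialize, not taken
from the thread: the LL/SC asm and the atomic_* macros include a "memory"
clobber, i.e. a compiler barrier, while the plain load/store replacement
does not, so the compiler may legally move protected accesses across the
"lock" in the single-thread path:

    lll_lock (list_lock, LLL_PRIVATE);   /* plain store *futex = 1, no
                                            compiler barrier */
    list_head = new_node;                /* may be reordered across the
                                            plain lock/unlock stores */
    lll_unlock (list_lock, LLL_PRIVATE); /* plain store *futex = 0 */

A signal handler using trylock could then observe the lock free while the
protected data is still in an intermediate state.)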

Regarding the robust locks case: What is using it outside of nptl?  And
if so, what's the point of using an explicitly robust lock if there's no
concurrency?  Who's going to clean up afterwards?
 
> @@ -237,8 +255,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>  #define lll_lock(lock, private) \
>    (void) ({                                                                  \
>      int *__futex = &(lock);                                                  \
> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
> -                         0) != 0)                                            \
> +    int __tmp;                                                               \
> +    if (!__lll_is_single_thread)                                             \
> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
> +    else                                                                     \
> +      {                                                                              \
> +       __tmp = *__futex;                                                     \
> +       if (__tmp == 0)                                                       \
> +         *__futex = 1;                                                       \
> +      }                                                                              \
> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
>        {                                                                              \
>         if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)       \
>           __lll_lock_wait_private (__futex);                                  \

This is obviously wrong for anything in nptl because you don't check
private.  You defined lll_is_single_thread so that it's always 0 in
nptl, so this works -- but it's not increasing the clarity of the code.

You also assume that nobody outside of nptl will call lll_lock with
private not set to LLL_PRIVATE.  This seems error-prone.  Or can you
show why that's fine?
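
(One hypothetical way to make that assumption explicit, not proposed in the
thread: tie the shortcut to the private flag as well, since a process-shared
lock can be contended by another process even while this process is
single-threaded:

    # define __lll_may_skip_atomics(private) \
      (__lll_is_single_thread \
       && __builtin_constant_p (private) && (private) == LLL_PRIVATE)

and test that instead of the bare __lll_is_single_thread.)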

Also, you add this code to all low-level locks independently of whether
they are actually used outside of nptl or not.  Have you looked at
libc_lock.h?  I'm not familiar with how non-nptl code uses the locks,
but it seems it's calling nptl code, so the change wouldn't have any
effect.

Finally, for a change as this whose effect on performance is
non-obvious, I think we really need performance measurements.  We need
to have microbenchmarks so that we can track the change of performance.
  
Adhemerval Zanella Netto May 2, 2014, 5:40 p.m. UTC | #4
On 02-05-2014 12:42, Torvald Riegel wrote:
> On Wed, 2014-04-30 at 10:57 -0300, Adhemerval Zanella wrote:
>> This patch adds a single-thread optimization for locks used within
>> libc.so.  For each lock operation it checks whether the process has already
>> spawned any threads and, if not, uses non-atomic operations.  Other libraries
>> (libpthread.so for instance) are unaffected by this change.
>>
>> This is a respin of my first patch to add such an optimization, but now the
>> code is focused only on lowlevellock.h; atomic.h is untouched.
>>
>> Tested on powerpc32 and powerpc64.
>>
>> --
>>
>>         * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>>         (__lll_is_single_thread): New macro to check if process has spawned
>>         any threads.
>>         (__lll_robust_trylock): Add optimization to avoid atomic operations in
>>         single thread case.
>>         (lll_lock): Likewise.
>>         (lll_robust_lock): Likewise.
>>         (lll_cond_lock): Likewise.
>>         (lll_robust_cond_lock): Likewise.
>>         (lll_timedlock): Likewise.
>>         (lll_robust_timedlock): Likewise.
>>         (lll_unlock): Likewise.
>>         (lll_robust_unlock): Likewise.
>>
>> ---
>>
>> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> index ab92c3f..38529a4 100644
>> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>> @@ -76,6 +76,16 @@
>>  # endif
>>  #endif
>>  
>> +/* For internal libc.so lock calls in single-thread process we use normal
>> +   load/stores.  */
>> +#if !defined NOT_IN_libc || defined UP
>> +# define __lll_is_single_thread                                                      \
>> +  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
> I disagree that single-threaded execution should be the likely case.
> There is a large body of existing single-threaded code.  But what are the
> microbenchmarks or data that we base this decision on?

Fair enough; in fact I think dropping the branch-prediction hint and letting
either the compiler or the chip do the prediction would be better.

>
>> +#else
>> +# define __lll_is_single_thread (0)
>> +#endif
>> +
>> +
>>  #define lll_futex_wait(futexp, val, private) \
>>    lll_futex_timed_wait (futexp, val, NULL, private)
>>  
>> @@ -205,7 +215,9 @@
>>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
>>  #define __lll_robust_trylock(futex, id) \
>>    ({ int __val;                                                                      \
>> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>> +     if (!__lll_is_single_thread)                                            \
>> +       __asm __volatile (                                                    \
>> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>>                        "        cmpwi   0,%0,0\n"                             \
>>                        "        bne     2f\n"                                 \
>>                        "        stwcx.  %3,0,%2\n"                            \
>> @@ -214,6 +226,12 @@
>>                        : "=&r" (__val), "=m" (*futex)                         \
>>                        : "r" (futex), "r" (id), "m" (*futex)                  \
>>                        : "cr0", "memory");                                    \
>> +     else                                                                    \
>> +       {                                                                     \
>> +        __val = *futex;                                                      \
>> +        if (__val == 0)                                                      \
>> +          *futex = id;                                                       \
>> +       }                                                                     \
>>       __val;                                                                  \
>>    })
> Conceptually, you can safely use trylock in signal handlers, because
> it's not a blocking operation.  And this is used for normal and robust
> locks.  I haven't checked all the trylock code and all uses to see
> whether it is indeed AS-Safe, but I'm pretty sure this change is wrong.
> At the very least, it doesn't document that trylock is now AS-Unsafe.

If that is the case I concede the point; however, I couldn't find any proper
documentation (either POSIX or our manual) saying trylock should be
AS-Safe.  And to answer your previous question, I don't know if anyone is
tracking the remaining pthread documentation.

>
> Regarding the robust locks case: What is using it outside of nptl?  And
> if so, what's the point of using an explicitly robust lock if there's no
> concurrency?  Who's going to clean up afterwards?

Indeed I see no use outside NPTL; I will drop it.

>
>> @@ -237,8 +255,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>>  #define lll_lock(lock, private) \
>>    (void) ({                                                                  \
>>      int *__futex = &(lock);                                                  \
>> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
>> -                         0) != 0)                                            \
>> +    int __tmp;                                                               \
>> +    if (!__lll_is_single_thread)                                             \
>> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
>> +    else                                                                     \
>> +      {                                                                              \
>> +       __tmp = *__futex;                                                     \
>> +       if (__tmp == 0)                                                       \
>> +         *__futex = 1;                                                       \
>> +      }                                                                              \
>> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
>>        {                                                                              \
>>         if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)       \
>>           __lll_lock_wait_private (__futex);                                  \
> This is obviously wrong for anything in nptl because you don't check
> private.  You defined lll_is_single_thread so that it's always 0 in
> nptl, so this works -- but it's not increasing the clarity of the code.
>
> You also assume that nobody outside of nptl will call lll_lock with
> private not set to LLL_PRIVATE.  This seems error-prone.  Or can you
> show why that's fine?
>
> Also, you add this code to all low-level locks independently of whether
> they are actually used outside of nptl or not.  Have you looked at
> libc_lock.h?  I'm not familiar with how non-nptl code uses the locks,
> but it seems it's calling nptl code, so the change wouldn't have any
> effect.

Indeed the libc-lock primitives seem the best place to implement such a
change; thanks for the pointer, I'll rebase the changes on those.
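
(A rough sketch of that alternative, editorial and under the assumption that
the libc-lock macros keep their historical shape from
nptl/sysdeps/pthread/bits/libc-lock.h: do the single-thread test once, at
the libc-lock layer, and leave the lll_* macros untouched:

    # define __libc_lock_lock(NAME) \
      ({ if (SINGLE_THREAD_P) \
           (NAME) = 1;        /* plain store; no other thread exists */ \
         else \
           lll_lock (NAME, LLL_PRIVATE); \
         0; })

SINGLE_THREAD_P is the predicate glibc already uses for the cancellation
fast path; a real version would need matching unlock and trylock variants.)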

>
> Finally, for a change as this whose effect on performance is
> non-obvious, I think we really need performance measurements.  We need
> to have microbenchmarks so that we can track the change of performance.
>
Although micro-benchmarks can be provided (and they will probably fall into
the 'this is not a real case' category), for PowerPC avoiding LL/SC
instructions where they are not required is always a gain.
  
Torvald Riegel May 5, 2014, 10:33 a.m. UTC | #5
On Fri, 2014-05-02 at 14:40 -0300, Adhemerval Zanella wrote:
> On 02-05-2014 12:42, Torvald Riegel wrote:
> > On Wed, 2014-04-30 at 10:57 -0300, Adhemerval Zanella wrote:
> >> +#else
> >> +# define __lll_is_single_thread (0)
> >> +#endif
> >> +
> >> +
> >>  #define lll_futex_wait(futexp, val, private) \
> >>    lll_futex_timed_wait (futexp, val, NULL, private)
> >>  
> >> @@ -205,7 +215,9 @@
> >>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
> >>  #define __lll_robust_trylock(futex, id) \
> >>    ({ int __val;                                                                      \
> >> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
> >> +     if (!__lll_is_single_thread)                                            \
> >> +       __asm __volatile (                                                    \
> >> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
> >>                        "        cmpwi   0,%0,0\n"                             \
> >>                        "        bne     2f\n"                                 \
> >>                        "        stwcx.  %3,0,%2\n"                            \
> >> @@ -214,6 +226,12 @@
> >>                        : "=&r" (__val), "=m" (*futex)                         \
> >>                        : "r" (futex), "r" (id), "m" (*futex)                  \
> >>                        : "cr0", "memory");                                    \
> >> +     else                                                                    \
> >> +       {                                                                     \
> >> +        __val = *futex;                                                      \
> >> +        if (__val == 0)                                                      \
> >> +          *futex = id;                                                       \
> >> +       }                                                                     \
> >>       __val;                                                                  \
> >>    })
> > Conceptually, you can safely use trylock in signal handlers, because
> > it's not a blocking operation.  And this is used for normal and robust
> > locks.  I haven't checked all the trylock code and all uses to see
> > whether it is indeed AS-Safe, but I'm pretty sure this change is wrong.
> > At the very least, it doesn't document that trylock is now AS-Unsafe.
> 
> If there is the case I concede to you, however I couldn't find any proper
> documentation (either POSIX or from our manual) saying trylocks should be
> AS-Safe.

We haven't documented it AFAIK, but these are our internal locks, and
unlike lock() which blocks, trylock() won't block.  So, trylock's
semantics can be useful in signal handler contexts, and you are
removing this possibility.  At the very least, if we have no current
internal users of trylock in AS-Safe functions, then we could consider
this change.  But I think this needs wider consensus in the community.

Regarding POSIX, I'm not aware of any explicit mention of what's safe
in signal handlers.  However, trylock is synchronizing, and documented
as non-blocking, so one could consider it to be AS-Safe.

> And answering your previous question, I don't know if anyone is
> tracking the remaining documentation about pthread.

Nor do I really, but if you make a change to the semantics, it's worth
documenting it even if there's no existing documentation.

> >
> > Finally, for a change as this whose effect on performance is
> > non-obvious, I think we really need performance measurements.  We need
> > to have microbenchmarks so that we can track the change of performance.
> >
> Although micro-benchmarks can be provided (and they will probably fall into
> the 'this is not a real case' category),

What do you mean?  Will you provide microbenchmarks later on, just some
for the atomics, or some for a specific use of the modified locks (e.g.,
malloc)?

> for PowerPC avoid LL/SC instructions where they
> are not required is always a gain.

I think we had consensus in the community that we want to track
performance, and base performance-focused changes on data that we can
reassess later on (e.g., for a future PowerPC generation).  If we want
to do that, then I do think this needs microbenchmarks.  Two options
come to mind:
* Plain RMWs or CASs to cached and uncached memory.  The best case for
the optimization, I suppose.
* Real use of the optimization in, say, malloc.  That's a more realistic
benchmark.

If the second doesn't show any significant performance differences
(e.g., because there's so much other stuff going on that this doesn't
matter), then maybe it's not worth it to maintain it.  We can only say
with some confidence what's the case after having numbers.
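
(For concreteness, a minimal sketch of the second option, editorial and not
from the thread: time a tight single-threaded malloc/free loop, which runs
through libc's internal locks, before and after the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int
    main (void)
    {
      enum { ITERS = 10 * 1000 * 1000 };
      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (long i = 0; i < ITERS; i++)
        {
          void *p = malloc (32);
          ((volatile char *) p)[0] = 0;  /* touch so the pair is not elided */
          free (p);
        }
      clock_gettime (CLOCK_MONOTONIC, &t1);
      double ns = (t1.tv_sec - t0.tv_sec) * 1e9
                  + (t1.tv_nsec - t0.tv_nsec);
      printf ("%.1f ns per malloc/free pair\n", ns / ITERS);
      return 0;
    }

A measurable difference here would back up the claim that avoiding LL/SC
pays off; no difference would suggest the extra branch is not worth
maintaining.)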
  
Will Newton May 6, 2014, 8:36 a.m. UTC | #6
On 2 May 2014 18:40, Adhemerval Zanella <azanella@linux.vnet.ibm.com> wrote:
> On 02-05-2014 12:42, Torvald Riegel wrote:
>> On Wed, 2014-04-30 at 10:57 -0300, Adhemerval Zanella wrote:
>>> This patch adds a single-thread optimization for locks used within
>>> libc.so.  For each lock operation it checks whether the process has already
>>> spawned any threads and, if not, uses non-atomic operations.  Other libraries
>>> (libpthread.so for instance) are unaffected by this change.
>>>
>>> This is a respin of my first patch to add such an optimization, but now the
>>> code is focused only on lowlevellock.h; atomic.h is untouched.
>>>
>>> Tested on powerpc32 and powerpc64.
>>>
>>> --
>>>
>>>         * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>>>         (__lll_is_single_thread): New macro to check if process has spawned
>>>         any threads.
>>>         (__lll_robust_trylock): Add optimization to avoid atomic operations in
>>>         single thread case.
>>>         (lll_lock): Likewise.
>>>         (lll_robust_lock): Likewise.
>>>         (lll_cond_lock): Likewise.
>>>         (lll_robust_cond_lock): Likewise.
>>>         (lll_timedlock): Likewise.
>>>         (lll_robust_timedlock): Likewise.
>>>         (lll_unlock): Likewise.
>>>         (lll_robust_unlock): Likewise.
>>>
>>> ---
>>>
>>> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>>> index ab92c3f..38529a4 100644
>>> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>>> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
>>> @@ -76,6 +76,16 @@
>>>  # endif
>>>  #endif
>>>
>>> +/* For internal libc.so lock calls in single-thread process we use normal
>>> +   load/stores.  */
>>> +#if !defined NOT_IN_libc || defined UP
>>> +# define __lll_is_single_thread                                                      \
>>> +  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
>> I disagree that single-threaded execution should be the likely case.
>> There is a large body of existing single-threaded code.  But what are the
>> microbenchmarks or data that we base this decision on?
>
> Fair enough; in fact I think dropping the branch-prediction hint and letting
> either the compiler or the chip do the prediction would be better.
>
>>
>>> +#else
>>> +# define __lll_is_single_thread (0)
>>> +#endif
>>> +
>>> +
>>>  #define lll_futex_wait(futexp, val, private) \
>>>    lll_futex_timed_wait (futexp, val, NULL, private)
>>>
>>> @@ -205,7 +215,9 @@
>>>  /* Set *futex to ID if it is 0, atomically.  Returns the old value */
>>>  #define __lll_robust_trylock(futex, id) \
>>>    ({ int __val;                                                                      \
>>> -     __asm __volatile ("1:     lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>>> +     if (!__lll_is_single_thread)                                            \
>>> +       __asm __volatile (                                                    \
>>> +                      "1:      lwarx   %0,0,%2" MUTEX_HINT_ACQ "\n"          \
>>>                        "        cmpwi   0,%0,0\n"                             \
>>>                        "        bne     2f\n"                                 \
>>>                        "        stwcx.  %3,0,%2\n"                            \
>>> @@ -214,6 +226,12 @@
>>>                        : "=&r" (__val), "=m" (*futex)                         \
>>>                        : "r" (futex), "r" (id), "m" (*futex)                  \
>>>                        : "cr0", "memory");                                    \
>>> +     else                                                                    \
>>> +       {                                                                     \
>>> +        __val = *futex;                                                      \
>>> +        if (__val == 0)                                                      \
>>> +          *futex = id;                                                       \
>>> +       }                                                                     \
>>>       __val;                                                                  \
>>>    })
>> Conceptually, you can safely use trylock in signal handlers, because
>> it's not a blocking operation.  And this is used for normal and robust
>> locks.  I haven't checked all the trylock code and all uses to see
>> whether it is indeed AS-Safe, but I'm pretty sure this change is wrong.
>> At the very least, it doesn't document that trylock is now AS-Unsafe.
>
> If that is the case I concede the point; however, I couldn't find any proper
> documentation (either POSIX or our manual) saying trylock should be
> AS-Safe.  And to answer your previous question, I don't know if anyone is
> tracking the remaining pthread documentation.
>
>>
>> Regarding the robust locks case: What is using it outside of nptl?  And
>> if so, what's the point of using an explicitly robust lock if there's no
>> concurrency?  Who's going to clean up afterwards?
>
> Indeed I see no use outside NPTL; I will drop it.
>
>>
>>> @@ -237,8 +255,16 @@ extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
>>>  #define lll_lock(lock, private) \
>>>    (void) ({                                                                  \
>>>      int *__futex = &(lock);                                                  \
>>> -    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
>>> -                         0) != 0)                                            \
>>> +    int __tmp;                                                               \
>>> +    if (!__lll_is_single_thread)                                             \
>>> +      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);           \
>>> +    else                                                                     \
>>> +      {                                                                              \
>>> +       __tmp = *__futex;                                                     \
>>> +       if (__tmp == 0)                                                       \
>>> +         *__futex = 1;                                                       \
>>> +      }                                                                              \
>>> +    if (__builtin_expect (__tmp, 0) != 0)                                    \
>>>        {                                                                              \
>>>         if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)       \
>>>           __lll_lock_wait_private (__futex);                                  \
>> This is obviously wrong for anything in nptl because you don't check
>> private.  You defined lll_is_single_thread so that it's always 0 in
>> nptl, so this works -- but it's not increasing the clarity of the code.
>>
>> You also assume that nobody outside of nptl will call lll_lock with
>> private not set to LLL_PRIVATE.  This seems error-prone.  Or can you
>> show why that's fine?
>>
>> Also, you add this code to all low-level locks independently of whether
>> they are actually used outside of nptl or not.  Have you looked at
>> libc_lock.h?  I'm not familiar with how non-nptl code uses the locks,
>> but it seems it's calling nptl code, so the change wouldn't have any
>> effect.
>
> Indeed the libc-lock primitives seem the best place to implement such a
> change; thanks for the pointer, I'll rebase the changes on those.
>
>>
>> Finally, for a change as this whose effect on performance is
>> non-obvious, I think we really need performance measurements.  We need
>> to have microbenchmarks so that we can track the change of performance.
>>
> Although micro-benchmarks can be provided (and they will probably fall into
> the 'this is not a real case' category), for PowerPC avoiding LL/SC
> instructions where they are not required is always a gain.

I recently posted a malloc micro-benchmark that should benefit from
these changes.
  

Patch

diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
index ab92c3f..38529a4 100644
--- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
+++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
@@ -76,6 +76,16 @@ 
 # endif
 #endif
 
+/* For internal libc.so lock calls in single-thread process we use normal
+   load/stores.  */
+#if !defined NOT_IN_libc || defined UP
+# define __lll_is_single_thread						      \
+  __glibc_likely (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
+#else
+# define __lll_is_single_thread (0)
+#endif
+
+
 #define lll_futex_wait(futexp, val, private) \
   lll_futex_timed_wait (futexp, val, NULL, private)
 
@@ -205,7 +215,9 @@ 
 /* Set *futex to ID if it is 0, atomically.  Returns the old value */
 #define __lll_robust_trylock(futex, id) \
   ({ int __val;								      \
-     __asm __volatile ("1:	lwarx	%0,0,%2" MUTEX_HINT_ACQ "\n"	      \
+     if (!__lll_is_single_thread)					      \
+       __asm __volatile (						      \
+		       "1:	lwarx	%0,0,%2" MUTEX_HINT_ACQ "\n"	      \
 		       "	cmpwi	0,%0,0\n"			      \
 		       "	bne	2f\n"				      \
 		       "	stwcx.	%3,0,%2\n"			      \
@@ -214,6 +226,12 @@ 
 		       : "=&r" (__val), "=m" (*futex)			      \
 		       : "r" (futex), "r" (id), "m" (*futex)		      \
 		       : "cr0", "memory");				      \
+     else								      \
+       {								      \
+	 __val = *futex;						      \
+	 if (__val == 0)						      \
+	   *futex = id;							      \
+       }								      \
      __val;								      \
   })
 
@@ -237,8 +255,16 @@  extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
 #define lll_lock(lock, private) \
   (void) ({								      \
     int *__futex = &(lock);						      \
-    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
-			  0) != 0)					      \
+    int __tmp;								      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);	      \
+    else								      \
+      {									      \
+	__tmp = *__futex;						      \
+	if (__tmp == 0)							      \
+	  *__futex = 1;							      \
+      }									      \
+    if (__builtin_expect (__tmp, 0) != 0)				      \
       {									      \
 	if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)	      \
 	  __lll_lock_wait_private (__futex);				      \
@@ -251,8 +277,16 @@  extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
   ({									      \
     int *__futex = &(lock);						      \
     int __val = 0;							      \
-    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, id,  \
-								0), 0))	      \
+    int __tmp;								      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_bool_acq (__futex, id, 0);	      \
+    else								      \
+      {									      \
+	__tmp = (*__futex == 0);					      \
+	if (__tmp)							      \
+	  *__futex = id;						      \
+      }									      \
+    if (__builtin_expect (__tmp, 0))					      \
       __val = __lll_robust_lock_wait (__futex, private);		      \
     __val;								      \
   })
@@ -260,8 +294,16 @@  extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
 #define lll_cond_lock(lock, private) \
   (void) ({								      \
     int *__futex = &(lock);						      \
-    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 2, 0),\
-			  0) != 0)					      \
+    int __tmp;								      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_val_acq (__futex, 2, 0);	      \
+    else								      \
+      {									      \
+	__tmp = *__futex;						      \
+	if (__tmp == 0)							      \
+	  *__futex = 2;							      \
+      }									      \
+    if (__builtin_expect (__tmp, 0) != 0)				      \
       __lll_lock_wait (__futex, private);				      \
   })
 
@@ -269,9 +311,17 @@  extern int __lll_robust_lock_wait (int *futex, int private) attribute_hidden;
   ({									      \
     int *__futex = &(lock);						      \
     int __val = 0;							      \
+    int __tmp;								      \
     int __id = id | FUTEX_WAITERS;					      \
-    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, __id,\
-								0), 0))	      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_bool_acq (__futex, __id, 0);	      \
+    else								      \
+      {									      \
+	__tmp = (*__futex == 0);					      \
+	if (__tmp)							      \
+	  *__futex = id;						      \
+      }									      \
+    if (__builtin_expect (__tmp, 0))					      \
       __val = __lll_robust_lock_wait (__futex, private);		      \
     __val;								      \
   })
@@ -286,8 +336,16 @@  extern int __lll_robust_timedlock_wait
   ({									      \
     int *__futex = &(lock);						      \
     int __val = 0;							      \
-    if (__builtin_expect (atomic_compare_and_exchange_val_acq (__futex, 1, 0),\
-			  0) != 0)					      \
+    int __tmp;								      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_val_acq (__futex, 1, 0);	      \
+    else								      \
+      {									      \
+	__tmp = *__futex;						      \
+	if (__tmp == 0)							      \
+	  *__futex = 1;							      \
+      }									      \
+    if (__builtin_expect (__tmp, 0) != 0)				      \
       __val = __lll_timedlock_wait (__futex, abstime, private);		      \
     __val;								      \
   })
@@ -296,8 +354,16 @@  extern int __lll_robust_timedlock_wait
   ({									      \
     int *__futex = &(lock);						      \
     int __val = 0;							      \
-    if (__builtin_expect (atomic_compare_and_exchange_bool_acq (__futex, id,  \
-								0), 0))	      \
+    int __tmp;								      \
+    if (!__lll_is_single_thread)					      \
+      __tmp = atomic_compare_and_exchange_bool_acq (__futex, id, 0);	      \
+    else								      \
+      {									      \
+	__tmp = (*__futex == 0);					      \
+	if (__tmp)							      \
+	  *__futex = id;						      \
+      }									      \
+    if (__builtin_expect (__tmp, 0))					      \
       __val = __lll_robust_timedlock_wait (__futex, abstime, private);	      \
     __val;								      \
   })
@@ -305,7 +371,14 @@  extern int __lll_robust_timedlock_wait
 #define lll_unlock(lock, private) \
   ((void) ({								      \
     int *__futex = &(lock);						      \
-    int __val = atomic_exchange_rel (__futex, 0);			      \
+    int __val;								      \
+    if (!__lll_is_single_thread)					      \
+      __val = atomic_exchange_rel (__futex, 0);				      \
+    else								      \
+      {									      \
+	__val = *__futex;						      \
+	*__futex = 0;							      \
+      }									      \
     if (__glibc_unlikely (__val > 1))					      \
       lll_futex_wake (__futex, 1, private);				      \
   }))
@@ -313,7 +386,14 @@  extern int __lll_robust_timedlock_wait
 #define lll_robust_unlock(lock, private) \
   ((void) ({								      \
     int *__futex = &(lock);						      \
-    int __val = atomic_exchange_rel (__futex, 0);			      \
+    int __val;								      \
+    if (!__lll_is_single_thread)					      \
+      __val = atomic_exchange_rel (__futex, 0);				      \
+    else								      \
+      {									      \
+	__val = *__futex;						      \
+	*__futex = 0;							      \
+      }									      \
     if (__glibc_unlikely (__val & FUTEX_WAITERS))			      \
       lll_futex_wake (__futex, 1, private);				      \
   }))