Patchwork [2/4] Small optimization for lowlevellock

login
register
mail settings
Submitter Adhemerval Zanella Netto
Date Feb. 22, 2019, 7:27 p.m.
Message ID <20190222192703.18177-2-adhemerval.zanella@linaro.org>
Download mbox | patch
Permalink /patch/31559/
State New
Headers show

Comments

Adhemerval Zanella Netto - Feb. 22, 2019, 7:27 p.m.
This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
by issuing only one lll_futex_wait.  Since it is defined as an inlined
syscall and inlined syscalls are defined using inlined assembly the
compiler usually cannot see that both calls are equal and optimize
accordingly.

On aarch64 the resulting binary is changed from:

0000000000000060 <__lll_lock_wait>:
  60:   2a0103e5        mov     w5, w1
  64:   b9400001        ldr     w1, [x0]
  68:   aa0003e4        mov     x4, x0
  6c:   7100083f        cmp     w1, #0x2
  70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
  74:   521900a1        eor     w1, w5, #0x80
  78:   d2800042        mov     x2, #0x2                        // #2
  7c:   93407c21        sxtw    x1, w1
  80:   d2800003        mov     x3, #0x0                        // #0
  84:   d2800c48        mov     x8, #0x62                       // #98
  88:   d4000001        svc     #0x0
  8c:   521900a5        eor     w5, w5, #0x80
  90:   52800046        mov     w6, #0x2                        // #2
  94:   93407ca5        sxtw    x5, w5
  98:   14000008        b       b8 <__lll_lock_wait+0x58>
  9c:   d503201f        nop
  a0:   aa0403e0        mov     x0, x4
  a4:   aa0503e1        mov     x1, x5
  a8:   d2800042        mov     x2, #0x2                        // #2
  ac:   d2800003        mov     x3, #0x0                        // #0
  b0:   d2800c48        mov     x8, #0x62                       // #98
  b4:   d4000001        svc     #0x0
  b8:   885ffc80        ldaxr   w0, [x4]
  bc:   88017c86        stxr    w1, w6, [x4]
  c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
  c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
  c8:   d65f03c0        ret

To:

0000000000000048 <__lll_lock_wait>:
  48:   aa0003e4        mov     x4, x0
  4c:   2a0103e5        mov     w5, w1
  50:   b9400000        ldr     w0, [x0]
  54:   7100081f        cmp     w0, #0x2
  58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
  5c:   52800041        mov     w1, #0x2                        // #2
  60:   885ffc80        ldaxr   w0, [x4]
  64:   88027c81        stxr    w2, w1, [x4]
  68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
  6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
  70:   521900a1        eor     w1, w5, #0x80
  74:   aa0403e0        mov     x0, x4
  78:   93407c21        sxtw    x1, w1
  7c:   d2800042        mov     x2, #0x2                        // #2
  80:   d2800003        mov     x3, #0x0                        // #0
  84:   d2800c48        mov     x8, #0x62                       // #98
  88:   d4000001        svc     #0x0
  8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
  90:   d65f03c0        ret

I see similar changes on powerpc and other architectures.  It also aligns
with x86_64 implementation by adding the systemtap probes.

Checked on aarch64-linux-gnu.

	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
	Optimize futex call and add systemtap probe.
---
 nptl/lowlevellock.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)
Adhemerval Zanella Netto - April 9, 2019, 12:11 p.m.
Ping.

On 22/02/2019 16:27, Adhemerval Zanella wrote:
> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
> by issuing only one lll_futex_wait.  Since it is defined as an inlined
> syscall and inlined syscalls are defined using inlined assembly the
> compiler usually can not see both calls are equal and optimize
> accordingly.
> 
> On aarch64 the resulting binary is change from:
> 
> 0000000000000060 <__lll_lock_wait>:
>   60:   2a0103e5        mov     w5, w1
>   64:   b9400001        ldr     w1, [x0]
>   68:   aa0003e4        mov     x4, x0
>   6c:   7100083f        cmp     w1, #0x2
>   70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
>   74:   521900a1        eor     w1, w5, #0x80
>   78:   d2800042        mov     x2, #0x2                        // #2
>   7c:   93407c21        sxtw    x1, w1
>   80:   d2800003        mov     x3, #0x0                        // #0
>   84:   d2800c48        mov     x8, #0x62                       // #98
>   88:   d4000001        svc     #0x0
>   8c:   521900a5        eor     w5, w5, #0x80
>   90:   52800046        mov     w6, #0x2                        // #2
>   94:   93407ca5        sxtw    x5, w5
>   98:   14000008        b       b8 <__lll_lock_wait+0x58>
>   9c:   d503201f        nop
>   a0:   aa0403e0        mov     x0, x4
>   a4:   aa0503e1        mov     x1, x5
>   a8:   d2800042        mov     x2, #0x2                        // #2
>   ac:   d2800003        mov     x3, #0x0                        // #0
>   b0:   d2800c48        mov     x8, #0x62                       // #98
>   b4:   d4000001        svc     #0x0
>   b8:   885ffc80        ldaxr   w0, [x4]
>   bc:   88017c86        stxr    w1, w6, [x4]
>   c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
>   c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
>   c8:   d65f03c0        ret
> 
> To:
> 
> 0000000000000048 <__lll_lock_wait>:
>   48:   aa0003e4        mov     x4, x0
>   4c:   2a0103e5        mov     w5, w1
>   50:   b9400000        ldr     w0, [x0]
>   54:   7100081f        cmp     w0, #0x2
>   58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
>   5c:   52800041        mov     w1, #0x2                        // #2
>   60:   885ffc80        ldaxr   w0, [x4]
>   64:   88027c81        stxr    w2, w1, [x4]
>   68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
>   6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
>   70:   521900a1        eor     w1, w5, #0x80
>   74:   aa0403e0        mov     x0, x4
>   78:   93407c21        sxtw    x1, w1
>   7c:   d2800042        mov     x2, #0x2                        // #2
>   80:   d2800003        mov     x3, #0x0                        // #0
>   84:   d2800c48        mov     x8, #0x62                       // #98
>   88:   d4000001        svc     #0x0
>   8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
>   90:   d65f03c0        ret
> 
> I see similar changes on powerpc and other architectures.  It also aligns
> with x86_64 implementation by adding the systemtap probes.
> 
> Checker on aarch64-linux-gnu.
> 
> 	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
> 	Optimize futex call and add systemtap probe.
> ---
>  nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>  1 file changed, 19 insertions(+), 12 deletions(-)
> 
> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
> index 5eaa3807ea..47548ff121 100644
> --- a/nptl/lowlevellock.c
> +++ b/nptl/lowlevellock.c
> @@ -17,20 +17,23 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <errno.h>
>  #include <sysdep.h>
>  #include <lowlevellock.h>
> -#include <sys/time.h>
>  #include <atomic.h>
> +#include <stap-probe.h>
>  
>  void
>  __lll_lock_wait_private (int *futex)
>  {
> -  if (*futex == 2)
> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> -
> -  while (atomic_exchange_acq (futex, 2) != 0)
> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> +  if (atomic_load_relaxed (futex) == 2)
> +    goto futex;
> +
> +  while (atomic_exchange_acquire (futex, 2) != 0)
> +    {
> +    futex:
> +      LIBC_PROBE (lll_lock_wait_private, 1, futex);
> +      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
> +    }
>  }
>  
>  
> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>  void
>  __lll_lock_wait (int *futex, int private)
>  {
> -  if (*futex == 2)
> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> -
> -  while (atomic_exchange_acq (futex, 2) != 0)
> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> +  if (atomic_load_relaxed (futex) == 2)
> +    goto futex;
> +
> +  while (atomic_exchange_acquire (futex, 2) != 0)
> +    {
> +    futex:
> +      LIBC_PROBE (lll_lock_wait, 1, futex);
> +      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
> +    }
>  }
>  #endif
>
Adhemerval Zanella Netto - May 9, 2019, 6:25 p.m.
Ping (x2). The third and fourth parts are already approved.

On 09/04/2019 09:11, Adhemerval Zanella wrote:
> Ping.
> 
> On 22/02/2019 16:27, Adhemerval Zanella wrote:
>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
>> by issuing only one lll_futex_wait.  Since it is defined as an inlined
>> syscall and inlined syscalls are defined using inlined assembly the
>> compiler usually can not see both calls are equal and optimize
>> accordingly.
>>
>> On aarch64 the resulting binary is change from:
>>
>> 0000000000000060 <__lll_lock_wait>:
>>   60:   2a0103e5        mov     w5, w1
>>   64:   b9400001        ldr     w1, [x0]
>>   68:   aa0003e4        mov     x4, x0
>>   6c:   7100083f        cmp     w1, #0x2
>>   70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
>>   74:   521900a1        eor     w1, w5, #0x80
>>   78:   d2800042        mov     x2, #0x2                        // #2
>>   7c:   93407c21        sxtw    x1, w1
>>   80:   d2800003        mov     x3, #0x0                        // #0
>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>   88:   d4000001        svc     #0x0
>>   8c:   521900a5        eor     w5, w5, #0x80
>>   90:   52800046        mov     w6, #0x2                        // #2
>>   94:   93407ca5        sxtw    x5, w5
>>   98:   14000008        b       b8 <__lll_lock_wait+0x58>
>>   9c:   d503201f        nop
>>   a0:   aa0403e0        mov     x0, x4
>>   a4:   aa0503e1        mov     x1, x5
>>   a8:   d2800042        mov     x2, #0x2                        // #2
>>   ac:   d2800003        mov     x3, #0x0                        // #0
>>   b0:   d2800c48        mov     x8, #0x62                       // #98
>>   b4:   d4000001        svc     #0x0
>>   b8:   885ffc80        ldaxr   w0, [x4]
>>   bc:   88017c86        stxr    w1, w6, [x4]
>>   c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
>>   c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
>>   c8:   d65f03c0        ret
>>
>> To:
>>
>> 0000000000000048 <__lll_lock_wait>:
>>   48:   aa0003e4        mov     x4, x0
>>   4c:   2a0103e5        mov     w5, w1
>>   50:   b9400000        ldr     w0, [x0]
>>   54:   7100081f        cmp     w0, #0x2
>>   58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
>>   5c:   52800041        mov     w1, #0x2                        // #2
>>   60:   885ffc80        ldaxr   w0, [x4]
>>   64:   88027c81        stxr    w2, w1, [x4]
>>   68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
>>   6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
>>   70:   521900a1        eor     w1, w5, #0x80
>>   74:   aa0403e0        mov     x0, x4
>>   78:   93407c21        sxtw    x1, w1
>>   7c:   d2800042        mov     x2, #0x2                        // #2
>>   80:   d2800003        mov     x3, #0x0                        // #0
>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>   88:   d4000001        svc     #0x0
>>   8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
>>   90:   d65f03c0        ret
>>
>> I see similar changes on powerpc and other architectures.  It also aligns
>> with x86_64 implementation by adding the systemtap probes.
>>
>> Checker on aarch64-linux-gnu.
>>
>> 	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
>> 	Optimize futex call and add systemtap probe.
>> ---
>>  nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>>  1 file changed, 19 insertions(+), 12 deletions(-)
>>
>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
>> index 5eaa3807ea..47548ff121 100644
>> --- a/nptl/lowlevellock.c
>> +++ b/nptl/lowlevellock.c
>> @@ -17,20 +17,23 @@
>>     License along with the GNU C Library; if not, see
>>     <http://www.gnu.org/licenses/>.  */
>>  
>> -#include <errno.h>
>>  #include <sysdep.h>
>>  #include <lowlevellock.h>
>> -#include <sys/time.h>
>>  #include <atomic.h>
>> +#include <stap-probe.h>
>>  
>>  void
>>  __lll_lock_wait_private (int *futex)
>>  {
>> -  if (*futex == 2)
>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> -
>> -  while (atomic_exchange_acq (futex, 2) != 0)
>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> +  if (atomic_load_relaxed (futex) == 2)
>> +    goto futex;
>> +
>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>> +    {
>> +    futex:
>> +      LIBC_PROBE (lll_lock_wait_private, 1, futex);
>> +      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>> +    }
>>  }
>>  
>>  
>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>>  void
>>  __lll_lock_wait (int *futex, int private)
>>  {
>> -  if (*futex == 2)
>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> -
>> -  while (atomic_exchange_acq (futex, 2) != 0)
>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> +  if (atomic_load_relaxed (futex) == 2)
>> +    goto futex;
>> +
>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>> +    {
>> +    futex:
>> +      LIBC_PROBE (lll_lock_wait, 1, futex);
>> +      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>> +    }
>>  }
>>  #endif
>>
Carlos O'Donell - May 9, 2019, 6:33 p.m.
On 5/9/19 2:25 PM, Adhemerval Zanella wrote:
> Ping (x2). The third and forth part are already approved.

This looks good to me.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

> On 09/04/2019 09:11, Adhemerval Zanella wrote:
>> Ping.
>>
>> On 22/02/2019 16:27, Adhemerval Zanella wrote:
>>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait
>>> by issuing only one lll_futex_wait.  Since it is defined as an inlined
>>> syscall and inlined syscalls are defined using inlined assembly the
>>> compiler usually can not see both calls are equal and optimize
>>> accordingly.

Right.

>>>
>>> On aarch64 the resulting binary is change from:
>>>
>>> 0000000000000060 <__lll_lock_wait>:
>>>   60:   2a0103e5        mov     w5, w1
>>>   64:   b9400001        ldr     w1, [x0]
>>>   68:   aa0003e4        mov     x4, x0
>>>   6c:   7100083f        cmp     w1, #0x2
>>>   70:   540000e1        b.ne    8c <__lll_lock_wait+0x2c>  // b.any
>>>   74:   521900a1        eor     w1, w5, #0x80
>>>   78:   d2800042        mov     x2, #0x2                        // #2
>>>   7c:   93407c21        sxtw    x1, w1
>>>   80:   d2800003        mov     x3, #0x0                        // #0
>>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>>   88:   d4000001        svc     #0x0
>>>   8c:   521900a5        eor     w5, w5, #0x80
>>>   90:   52800046        mov     w6, #0x2                        // #2
>>>   94:   93407ca5        sxtw    x5, w5
>>>   98:   14000008        b       b8 <__lll_lock_wait+0x58>
>>>   9c:   d503201f        nop
>>>   a0:   aa0403e0        mov     x0, x4
>>>   a4:   aa0503e1        mov     x1, x5
>>>   a8:   d2800042        mov     x2, #0x2                        // #2
>>>   ac:   d2800003        mov     x3, #0x0                        // #0
>>>   b0:   d2800c48        mov     x8, #0x62                       // #98
>>>   b4:   d4000001        svc     #0x0
>>>   b8:   885ffc80        ldaxr   w0, [x4]
>>>   bc:   88017c86        stxr    w1, w6, [x4]
>>>   c0:   35ffffc1        cbnz    w1, b8 <__lll_lock_wait+0x58>
>>>   c4:   35fffee0        cbnz    w0, a0 <__lll_lock_wait+0x40>
>>>   c8:   d65f03c0        ret
>>>
>>> To:
>>>
>>> 0000000000000048 <__lll_lock_wait>:
>>>   48:   aa0003e4        mov     x4, x0
>>>   4c:   2a0103e5        mov     w5, w1
>>>   50:   b9400000        ldr     w0, [x0]
>>>   54:   7100081f        cmp     w0, #0x2
>>>   58:   540000c0        b.eq    70 <__lll_lock_wait+0x28>  // b.none
>>>   5c:   52800041        mov     w1, #0x2                        // #2
>>>   60:   885ffc80        ldaxr   w0, [x4]
>>>   64:   88027c81        stxr    w2, w1, [x4]
>>>   68:   35ffffc2        cbnz    w2, 60 <__lll_lock_wait+0x18>
>>>   6c:   34000120        cbz     w0, 90 <__lll_lock_wait+0x48>
>>>   70:   521900a1        eor     w1, w5, #0x80
>>>   74:   aa0403e0        mov     x0, x4
>>>   78:   93407c21        sxtw    x1, w1
>>>   7c:   d2800042        mov     x2, #0x2                        // #2
>>>   80:   d2800003        mov     x3, #0x0                        // #0
>>>   84:   d2800c48        mov     x8, #0x62                       // #98
>>>   88:   d4000001        svc     #0x0
>>>   8c:   17fffff4        b       5c <__lll_lock_wait+0x14>
>>>   90:   d65f03c0        ret
>>>

Nice!

>>> I see similar changes on powerpc and other architectures.  It also aligns
>>> with x86_64 implementation by adding the systemtap probes.
>>>
>>> Checker on aarch64-linux-gnu.
>>>
>>> 	* nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private):
>>> 	Optimize futex call and add systemtap probe.
>>> ---
>>>  nptl/lowlevellock.c | 31 +++++++++++++++++++------------
>>>  1 file changed, 19 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
>>> index 5eaa3807ea..47548ff121 100644
>>> --- a/nptl/lowlevellock.c
>>> +++ b/nptl/lowlevellock.c
>>> @@ -17,20 +17,23 @@
>>>     License along with the GNU C Library; if not, see
>>>     <http://www.gnu.org/licenses/>.  */
>>>  
>>> -#include <errno.h>
>>>  #include <sysdep.h>
>>>  #include <lowlevellock.h>
>>> -#include <sys/time.h>
>>>  #include <atomic.h>
>>> +#include <stap-probe.h>

OK.

>>>  
>>>  void
>>>  __lll_lock_wait_private (int *futex)
>>>  {
>>> -  if (*futex == 2)
>>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
>>> -
>>> -  while (atomic_exchange_acq (futex, 2) != 0)
>>> -    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */

OK.

>>> +  if (atomic_load_relaxed (futex) == 2)
>>> +    goto futex;
>>> +
>>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>>> +    {
>>> +    futex:
>>> +      LIBC_PROBE (lll_lock_wait_private, 1, futex);
>>> +      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */

OK.

>>> +    }
>>>  }
>>>  
>>>  
>>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex)
>>>  void
>>>  __lll_lock_wait (int *futex, int private)
>>>  {
>>> -  if (*futex == 2)
>>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>>> -
>>> -  while (atomic_exchange_acq (futex, 2) != 0)
>>> -    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
>>> +  if (atomic_load_relaxed (futex) == 2)
>>> +    goto futex;

OK.

>>> +
>>> +  while (atomic_exchange_acquire (futex, 2) != 0)
>>> +    {
>>> +    futex:
>>> +      LIBC_PROBE (lll_lock_wait, 1, futex);
>>> +      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */

OK.

>>> +    }
>>>  }
>>>  #endif
>>>

Patch

diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
index 5eaa3807ea..47548ff121 100644
--- a/nptl/lowlevellock.c
+++ b/nptl/lowlevellock.c
@@ -17,20 +17,23 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <errno.h>
 #include <sysdep.h>
 #include <lowlevellock.h>
-#include <sys/time.h>
 #include <atomic.h>
+#include <stap-probe.h>
 
 void
 __lll_lock_wait_private (int *futex)
 {
-  if (*futex == 2)
-    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
-
-  while (atomic_exchange_acq (futex, 2) != 0)
-    lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
+  if (atomic_load_relaxed (futex) == 2)
+    goto futex;
+
+  while (atomic_exchange_acquire (futex, 2) != 0)
+    {
+    futex:
+      LIBC_PROBE (lll_lock_wait_private, 1, futex);
+      lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
+    }
 }
 
 
@@ -39,10 +42,14 @@  __lll_lock_wait_private (int *futex)
 void
 __lll_lock_wait (int *futex, int private)
 {
-  if (*futex == 2)
-    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
-
-  while (atomic_exchange_acq (futex, 2) != 0)
-    lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
+  if (atomic_load_relaxed (futex) == 2)
+    goto futex;
+
+  while (atomic_exchange_acquire (futex, 2) != 0)
+    {
+    futex:
+      LIBC_PROBE (lll_lock_wait, 1, futex);
+      lll_futex_wait (futex, 2, private); /* Wait if *futex == 2.  */
+    }
 }
 #endif