Message ID | 1482343384-28430-1-git-send-email-tuliom@linux.vnet.ibm.com |
---|---|
State | Superseded |
Delegated to: | Torvald Riegel |
Headers |
From: "Tulio Magno Quites Machado Filho" <tuliom@linux.vnet.ibm.com>
To: libc-alpha@sourceware.org
Cc: vapier@gentoo.org, raji@linux.vnet.ibm.com, triegel@redhat.com, munroesj@linux.vnet.ibm.com
Subject: [PATCHv3] powerpc: Fix write-after-destroy in lock elision
Date: Wed, 21 Dec 2016 16:03:04 -0200
In-Reply-To: <20161212043646.GL10558@vapier.lan>
References: <20161212043646.GL10558@vapier.lan>
Message-Id: <1482343384-28430-1-git-send-email-tuliom@linux.vnet.ibm.com> |
Commit Message
Tulio Magno Quites Machado Filho
Dec. 21, 2016, 6:03 p.m. UTC
From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Changes since v2:
- Fixed coding style.
Changes since v1:
- Replaced references to a data race with the actual error: write-after-destroy.
- Enclosed adapt_count accesses in C11 atomics.
--- 8< ---
The update of *adapt_count after the release of the lock causes a race
condition when thread A unlocks, thread B continues and destroys the
mutex, and thread A writes to *adapt_count.
2016-12-21 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Steven Munroe <sjmunroe@us.ibm.com>
Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
[BZ #20822]
* sysdeps/unix/sysv/linux/powerpc/elision-lock.c
(__lll_lock_elision): Access adapt_count via C11 atomics.
* sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
(__lll_trylock_elision): Likewise.
* sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
(__lll_unlock_elision): Update adapt_count variable inside the
critical section.
---
sysdeps/unix/sysv/linux/powerpc/elision-lock.c | 8 +++++---
sysdeps/unix/sysv/linux/powerpc/elision-trylock.c | 7 ++++---
sysdeps/unix/sysv/linux/powerpc/elision-unlock.c | 14 +++++++++-----
3 files changed, 18 insertions(+), 11 deletions(-)
Comments
Ping?

Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> writes:
> [full v3 patch quoted]
LGTM.

On 12/21/2016 04:03 PM, Tulio Magno Quites Machado Filho wrote:
> [full v3 patch and diff quoted]
LGTM with a nit only on the comment about the adapt_count decrement in
unlock.

On 21/12/2016 16:03, Tulio Magno Quites Machado Filho wrote:
> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
> [...]
> The update of *adapt_count after the release of the lock causes a race
> condition when thread A unlocks, thread B continues and destroys the
> mutex, and thread A writes to *adapt_count.
>
> diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> index 43c5a67..0e0baf5 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> +++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
> @@ -28,13 +28,17 @@ __lll_unlock_elision (int *lock, short *adapt_count, int pshared)
>      __libc_tend (0);
>    else
>      {
> -      lll_unlock ((*lock), pshared);
> -
> -      /* Update the adapt count AFTER completing the critical section.
> -         Doing this here prevents unneeded stalling when entering
> -         a critical section.  Saving about 8% runtime on P8.  */
> +      /* Update adapt_count in the critical section to prevent a
> +         write-after-destroy error as mentioned in BZ 20822.  The
> +         following update of adapt_count is clearly contained within
> +         the critical region of the fall-back lock as it immediately
> +         precedes the unlock.  Attempting to use C11 atomics to access
> +         adapt_count would be (at minimum) misleading and (at worse) a
> +         serious error forcing a larx-hit-stcx flush.  */
>        if (*adapt_count > 0)
>          (*adapt_count)--;
> +
> +      lll_unlock ((*lock), pshared);
>      }
>    return 0;
>  }

I think the C11 atomic comment should outline that using
atomic_fetch_add_relaxed generates suboptimal code, not that it is
misleading, since it generates an atomic sequence using lharx/sthcx.
in a transaction region.  Something like:

  /* Update adapt_count in the critical section to prevent a
     write-after-destroy error as mentioned in BZ 20822.  The
     following update of adapt_count is clearly contained within
     the critical region of the fall-back lock as it immediately
     precedes the unlock.
     Using C11 atomics like atomic_fetch_add_relaxed generates
     suboptimal code (a lharx/sthcx. sequence) where a plain load,
     update, and store suffice inside an HTM region (the whole
     idea of using this construction).  */
On Mon, 2017-01-02 at 15:03 -0200, Adhemerval Zanella wrote:
> LGTM with a nit only on the comment about the adapt_count decrement in
> unlock.
> [...]
> I think the C11 atomic comment should outline that using
> atomic_fetch_add_relaxed generates suboptimal code, not that it is
> misleading, since it generates an atomic sequence using lharx/sthcx.
> in a transaction region.  Something like:
>
>   /* Update adapt_count in the critical section to prevent a
>      write-after-destroy error as mentioned in BZ 20822.  The
>      following update of adapt_count is clearly contained within
>      the critical region of the fall-back lock as it immediately
>      precedes the unlock.
>      Using C11 atomics like atomic_fetch_add_relaxed generates
>      suboptimal code (a lharx/sthcx. sequence) where a plain load,
>      update, and store suffice inside an HTM region (the whole
>      idea of using this construction).  */

Yes.

For the record, I'm still of the opinion that it should just use atomic
relaxed MO loads and stores -- simply for consistency and to make a future
transition to C11 atomic types easier.
On 02/01/2017 15:21, Torvald Riegel wrote:
> [...]
> For the record, I'm still of the opinion that it should just use atomic
> relaxed MO loads and stores -- simply for consistency and to make a future
> transition to C11 atomic types easier.

I think it can be changed to:

  if (atomic_load_relaxed (adapt_count) > 0)
    (*adapt_count)--;

without performance issues, and the comment will outline why C11 atomics
are not being used in this specific snippet.
On Mon, 2017-01-02 at 15:25 -0200, Adhemerval Zanella wrote:
> [...]
> I think the C11 atomic comment should outline that using
> atomic_fetch_add_relaxed generates suboptimal code, not that it is
> misleading, since it generates an atomic sequence using lharx/sthcx.
> in a transaction region.

Looking at this again, this is not guaranteed to be in a transaction.
(Wasn't all this supposed to always be in transactional code?  Or am I
mixing this up with another patch?)

It uses the lock, so can be concurrent with other accesses (that use
atomics).  Thus, it must use atomics too to prevent data races.

The comment should also mention the mutex destruction requirements by
POSIX because this is how we name that requirement everywhere else.
On 02/01/2017 15:39, Torvald Riegel wrote:
> [...]
> Looking at this again, this is not guaranteed to be in a transaction.
> (Wasn't all this supposed to always be in transactional code?  Or am I
> mixing this up with another patch?)

You are right, the adapt_count decrement is indeed on the default LL/SC
path, not the HTM one.

> It uses the lock, so can be concurrent with other accesses (that use
> atomics).  Thus, it must use atomics too to prevent data races.

Indeed, we need to use an atomic decrement on this one as well.  I am not
sure whether a stronger semantic than relaxed is needed.

> The comment should also mention the mutex destruction requirements by
> POSIX because this is how we name that requirement everywhere else.

Nod.
On Mon, 2017-01-02 at 16:15 -0200, Adhemerval Zanella wrote:
> On 02/01/2017 15:39, Torvald Riegel wrote:
> > It uses the lock, so can be concurrent with other accesses (that use
> > atomics).  Thus, it must use atomics too to prevent data races.
>
> Indeed, we need to use an atomic decrement on this one as well.  I am not
> sure whether a stronger semantic than relaxed is needed.

adapt_count is just a hint, as is explained in the comments in the
similar patch I had sent for x86.  Given that it's just a hint, lost
updates are harmless for correctness, and the decrement does not need to
be atomic.  The individual loads and stores of a decrement should be,
though, to avoid data races.
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
index dd1e4c3..86adbc9 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
@@ -45,7 +45,7 @@
 int
 __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)
 {
-  if (*adapt_count > 0)
+  if (atomic_load_relaxed (adapt_count) > 0)
     {
       goto use_lock;
     }
@@ -67,7 +67,8 @@ __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)
 	  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
 	    {
 	      if (aconf.skip_lock_internal_abort > 0)
-		*adapt_count = aconf.skip_lock_internal_abort;
+		atomic_store_relaxed (adapt_count,
+				      aconf.skip_lock_internal_abort);
 	      goto use_lock;
 	    }
 	}
@@ -75,7 +76,8 @@ __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)

   /* Fall back to locks for a bit if retries have been exhausted */
   if (aconf.try_tbegin > 0 && aconf.skip_lock_out_of_tbegin_retries > 0)
-    *adapt_count = aconf.skip_lock_out_of_tbegin_retries;
+    atomic_store_relaxed (adapt_count,
+			  aconf.skip_lock_out_of_tbegin_retries);

 use_lock:
   return LLL_LOCK ((*lock), pshared);
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
index 0807a6a..6061856 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
@@ -34,7 +34,7 @@ __lll_trylock_elision (int *futex, short *adapt_count)
     __libc_tabort (_ABORT_NESTED_TRYLOCK);

   /* Only try a transaction if it's worth it.  */
-  if (*adapt_count > 0)
+  if (atomic_load_relaxed (adapt_count) > 0)
     {
       goto use_lock;
     }
@@ -49,7 +49,7 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	  __libc_tend (0);

 	  if (aconf.skip_lock_busy > 0)
-	    *adapt_count = aconf.skip_lock_busy;
+	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
 	}
       else
 	{
@@ -59,7 +59,8 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	     result in another failure.  Use normal locking now and
 	     for the next couple of calls.  */
 	  if (aconf.skip_trylock_internal_abort > 0)
-	    *adapt_count = aconf.skip_trylock_internal_abort;
+	    atomic_store_relaxed (adapt_count,
+				  aconf.skip_trylock_internal_abort);
 	}
     }

diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
index 43c5a67..0e0baf5 100644
--- a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
@@ -28,13 +28,17 @@ __lll_unlock_elision (int *lock, short *adapt_count, int pshared)
     __libc_tend (0);
   else
     {
-      lll_unlock ((*lock), pshared);
-
-      /* Update the adapt count AFTER completing the critical section.
-         Doing this here prevents unneeded stalling when entering
-         a critical section.  Saving about 8% runtime on P8.  */
+      /* Update adapt_count in the critical section to prevent a
+         write-after-destroy error as mentioned in BZ 20822.  The
+         following update of adapt_count is clearly contained within
+         the critical region of the fall-back lock as it immediately
+         precedes the unlock.  Attempting to use C11 atomics to access
+         adapt_count would be (at minimum) misleading and (at worse) a
+         serious error forcing a larx-hit-stcx flush.  */
       if (*adapt_count > 0)
         (*adapt_count)--;
+
+      lll_unlock ((*lock), pshared);
     }
   return 0;
 }