[v2] PowerPC: Fix a race condition when eliding a lock

Message ID 1440084054-32243-1-git-send-email-tuliom@linux.vnet.ibm.com
State Superseded
Delegated to: Carlos O'Donell
Headers

Commit Message

Tulio Magno Quites Machado Filho Aug. 20, 2015, 3:20 p.m. UTC
  Changes since v1:
 - Improved commit message.
 - Added new comments in the source code alerting to the concurrency details
   intrinsic to this code.
 - Removed compiler barriers from this patch, which will be treated in another
   patch and will be synchronized with the GCC implementation.

8<----------

The previous code used to evaluate the preprocessor token is_lock_free to
a variable before starting a transaction.  This behavior can cause an
error if another thread got the lock (without using a transaction)
between the evaluation of the token and the beginning of the transaction.

This patch delays the evaluation of is_lock_free to inside a transaction
by moving this part of the code to the macro ELIDE_LOCK.

2015-08-20  Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>

	[BZ #18743]
	* sysdeps/powerpc/nptl/elide.h (__elide_lock): Move most of this
	code to...
	(ELIDE_LOCK): ...here.
	(__get_new_count): New function with part of the code from
	__elide_lock that updates the value of adapt_count after a
	transaction abort.
	(__elided_trylock): Moved this code to...
	(ELIDE_TRYLOCK): ...here.
---
 NEWS                         |   3 +-
 sysdeps/powerpc/nptl/elide.h | 120 ++++++++++++++++++++++++-------------------
 2 files changed, 70 insertions(+), 53 deletions(-)
  

Comments

Carlos O'Donell Aug. 20, 2015, 7:21 p.m. UTC | #1
On 08/20/2015 11:20 AM, Tulio Magno Quites Machado Filho wrote:
> Changes since v1:
>  - Improved commit message.
>  - Added new comments in the source code alerting to the concurrency details
>    intrinsic to this code.
>  - Removed compiler barriers from this patch, which will be treated in another
>    patch and will be synchronized with the GCC implementation.

The architecture and idea of this change is correct, but there are still
a few details we need to get right. It's almost there, probably v3 and
we'll be done. See my comments below.

> 8<----------
> 
> The previous code used to evaluate the preprocessor token is_lock_free to
> a variable before starting a transaction.  This behavior can cause an
> error if another thread got the lock (without using a transaction)
> between the evaluation of the token and the beginning of the transaction.
> 
> This patch delays the evaluation of is_lock_free to inside a transaction
> by moving this part of the code to the macro ELIDE_LOCK.

This is looking better, but the comments talk only about the error
you are fixing, and not the overall concurrency scheme. The latter
is really what we want to document, and from that documentation it
should flow naturally that is_lock_free must, as a requirement,
be read within the transaction safely.

> 2015-08-20  Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>
> 
> 	[BZ #18743]
> 	* sysdeps/powerpc/nptl/elide.h (__elide_lock): Move most of this
> 	code to...
> 	(ELIDE_LOCK): ...here.
> 	(__get_new_count): New function with part of the code from
> 	__elide_lock that updates the value of adapt_count after a
> 	transaction abort.
> 	(__elided_trylock): Moved this code to...
> 	(ELIDE_TRYLOCK): ...here.
> ---
>  NEWS                         |   3 +-
>  sysdeps/powerpc/nptl/elide.h | 120 ++++++++++++++++++++++++-------------------
>  2 files changed, 70 insertions(+), 53 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index b75a202..49aba2d 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -11,7 +11,8 @@ Version 2.23
>  
>    14341, 16517, 16519, 16520, 16734, 16973, 17787, 17905, 18084, 18086,
>    18265, 18370, 18421, 18480, 18525, 18618, 18647, 18661, 18674, 18681,
> -  18778, 18781, 18787, 18789, 18790, 18795, 18796, 18820, 18823, 18824.
> +  18743, 18778, 18781, 18787, 18789, 18790, 18795, 18796, 18820, 18823,
> +  18824.
>  
>  * The obsolete header <regexp.h> has been removed.  Programs that require
>    this header must be updated to use <regex.h> instead.
> diff --git a/sysdeps/powerpc/nptl/elide.h b/sysdeps/powerpc/nptl/elide.h
> index 389f5a5..261efd5 100644
> --- a/sysdeps/powerpc/nptl/elide.h
> +++ b/sysdeps/powerpc/nptl/elide.h
> @@ -23,67 +23,83 @@
>  # include <htm.h>
>  # include <elision-conf.h>
>  
> -/* Returns true if the lock defined by is_lock_free as elided.
> -   ADAPT_COUNT is a pointer to per-lock state variable. */
> -
> +/* Get the new value of adapt_count according to the elision
> +   configurations.  Returns true if the system should retry again or false
> +   otherwise.  */
>  static inline bool
> -__elide_lock (uint8_t *adapt_count, int is_lock_free)
> +__get_new_count (uint8_t *adapt_count)
>  {
> -  if (*adapt_count > 0)
> +  /* A persistent failure indicates that a retry will probably
> +     result in another failure.  Use normal locking now and
> +     for the next couple of calls.  */
> +  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
>      {
> -      (*adapt_count)--;
> +      if (__elision_aconf.skip_lock_internal_abort > 0)
> +	*adapt_count = __elision_aconf.skip_lock_internal_abort;
>        return false;
>      }
> -
> -  for (int i = __elision_aconf.try_tbegin; i > 0; i--)
> -    {
> -      if (__builtin_tbegin (0))
> -	{
> -	  if (is_lock_free)
> -	    return true;
> -	  /* Lock was busy.  */
> -	  __builtin_tabort (_ABORT_LOCK_BUSY);
> -	}
> -      else
> -	{
> -	  /* A persistent failure indicates that a retry will probably
> -	     result in another failure.  Use normal locking now and
> -	     for the next couple of calls.  */
> -	  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
> -	    {
> -	      if (__elision_aconf.skip_lock_internal_abort > 0)
> -		*adapt_count = __elision_aconf.skip_lock_internal_abort;
> -	      break;
> -	    }
> -	  /* Same logic as above, but for a number of temporary failures in a
> -	     a row.  */
> -	  else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0
> -		   && __elision_aconf.try_tbegin > 0)
> -	    *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries;
> -	}
> -     }
> -
> -  return false;
> +  /* Same logic as above, but for a number of temporary failures in a
> +     row.  */
> +  else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0
> +	   && __elision_aconf.try_tbegin > 0)
> +    *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries;
> +  return true;

OK.

>  }
>  
> -# define ELIDE_LOCK(adapt_count, is_lock_free) \
> -  __elide_lock (&(adapt_count), is_lock_free)
> -
> -
> -static inline bool
> -__elide_trylock (uint8_t *adapt_count, int is_lock_free, int write)
> -{
> -  if (__elision_aconf.try_tbegin > 0)
> -    {
> -      if (write)
> -	__builtin_tabort (_ABORT_NESTED_TRYLOCK);
> -      return __elide_lock (adapt_count, is_lock_free);
> -    }
> -  return false;
> -}
> +/* CONCURRENCY NOTES:
> +
> +   Always evaluate is_lock_free from inside a transaction in macros
> +   ELIDE_LOCK and ELIDE_TRYLOCK.
> +   Failing to do so creates a race condition on the memory access specified
> +   in is_lock_free which can be triggered with the following order of
> +   events:
> +   1. The lock accessed by is_lock_free is free.
> +   2. Thread T1 evaluates is_lock_free and stores into register R1 that the
> +      lock is free.
> +   3. Thread T2 acquires the same lock used in is_lock_free.
> +   4. T1 begins the transaction, creating a memory barrier where is_lock_free
> +      is false, but R1 is true.
> +   5. T1 reads R1 and doesn't abort the transaction.
> +   6. T1 calls ELIDE_UNLOCK, which reads false from is_lock_free and decides
> +      to unlock a lock acquired by T2, leading to undefined behavior.  */

This talks about this particular problem, but should instead talk about
why is_lock_free need not have any locks or atomic accesses.

e.g.

/* CONCURRENCY NOTES:

   The value of is_lock_free is read and possibly written to by multiple
   threads, either during lock acquisition, or elision. In order to avoid
   this evaluation from becoming a data race the access of is_lock_free
   is placed *inside* the transaction. Within the transaction is_lock_free
   can be accessed with relaxed ordering. We are assured by the transaction
   that the value of is_lock_free is consistent with the state of the lock,
   otherwise the hardware will abort the transaction.

   ... */

Feel free to add back the notes on the invalid case if you want, but the
above is the minimum I'm expecting. It's a definition of the intent of the
current code.

Does that make sense?

Perhaps Torvald can review this too.

I apologize in advance for making this take so long, but we all need
to use the same language and calibrate our expectations regarding
the level of detail that we need here and the language used.

> +
> +/* Returns 0 if the lock defined by is_lock_free was elided.
> +   ADAPT_COUNT is a per-lock state variable.  */
> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
> +  ({									\
> +    int ret = 0;							\
> +    if (adapt_count > 0)						\
> +      (adapt_count)--;							\
> +    else								\
> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
> +	{								\
> +	  if (__builtin_tbegin (0))					\
> +	    {								\
> +	      if (is_lock_free)						\

This should use a relaxed MO atomic access to highlight that this data *is*
shared between threads, but that there are no ordering guarantees. It should
result in a normal load, but it makes it clear in the code that some other
thread is also accessing this data, and that the transaction as a barrier
is all that is protecting this from being a data-race (undefined behaviour).

Torvald made the same comment in his review, and I'm just repeating it here
because I think it bears repeating.

> +		{							\
> +		  ret = 1;						\
> +		  break;						\
> +		}							\
> +	      __builtin_tabort (_ABORT_LOCK_BUSY);			\
> +	    }								\
> +	  else								\
> +	    if (!__get_new_count (&adapt_count))			\
> +	      break;							\
> +	}								\
> +    ret;								\
> +  })
>  
>  # define ELIDE_TRYLOCK(adapt_count, is_lock_free, write)	\
> -  __elide_trylock (&(adapt_count), is_lock_free, write)
> +  ({								\
> +    int ret = 0;						\
> +    if (__elision_aconf.try_tbegin > 0)				\
> +      {								\
> +	if (write)						\
> +	  __builtin_tabort (_ABORT_NESTED_TRYLOCK);		\
> +	ret = ELIDE_LOCK (adapt_count, is_lock_free);		\
> +      }								\
> +    ret;							\
> +  })
>  
>  
>  static inline bool
> 


Cheers,
Carlos.
  
Adhemerval Zanella Aug. 20, 2015, 7:51 p.m. UTC | #2
>> +
>> +/* Returns 0 if the lock defined by is_lock_free was elided.
>> +   ADAPT_COUNT is a per-lock state variable.  */
>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
>> +  ({									\
>> +    int ret = 0;							\
>> +    if (adapt_count > 0)						\
>> +      (adapt_count)--;							\
>> +    else								\
>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
>> +	{								\
>> +	  if (__builtin_tbegin (0))					\
>> +	    {								\
>> +	      if (is_lock_free)						\
> 
> This should use a relaxed MO atomic access to highlight that this data *is*
> shared between threads, but that there are no ordering guarantees. It should
> result in a normal load, but it makes it clear in the code that some other
> thread is also accessing this data, and that the transaction as a barrier
> is all that is protecting this from being a data-race (undefined behaviour).

Should we really highlight it using atomic operations? The idea is exactly
that any access within a transaction will not result in undefined behavior,
so no need to atomic for synchronization.

> 
> Torvald made the same comment in his review, and I'm just repeating it here
> because I think it bears repeating.
> 
>> +		{							\
>> +		  ret = 1;						\
>> +		  break;						\
>> +		}							\
>> +	      __builtin_tabort (_ABORT_LOCK_BUSY);			\
>> +	    }								\
>> +	  else								\
>> +	    if (!__get_new_count(&adapt_count))				\
>> +	      break;							\
>> +	}								\
>> +    ret;								\
>> +  })
>>  
>>  # define ELIDE_TRYLOCK(adapt_count, is_lock_free, write)	\
>> -  __elide_trylock (&(adapt_count), is_lock_free, write)
>> +  ({								\
>> +    int ret = 0;						\
>> +    if (__elision_aconf.try_tbegin > 0)				\
>> +      {								\
>> +	if (write)						\
>> +	  __builtin_tabort (_ABORT_NESTED_TRYLOCK);		\
>> +	ret = ELIDE_LOCK (adapt_count, is_lock_free);		\
>> +      }								\
>> +    ret;							\
>> +  })
>>  
>>  
>>  static inline bool
>>
> 
> 
> Cheers,
> Carlos.
>
  
Tulio Magno Quites Machado Filho Aug. 20, 2015, 9:52 p.m. UTC | #3
"Carlos O'Donell" <carlos@redhat.com> writes:

> On 08/20/2015 11:20 AM, Tulio Magno Quites Machado Filho wrote:
>> Changes since v1:
>>  - Improved commit message.
>>  - Added new comments in the source code alerting to the concurrency details
>>    intrinsic to this code.
>>  - Removed compiler barriers from this patch, which will be treated in another
>>    patch and will be synchronized with the GCC implementation.
>
> The architecture and idea of this change is correct, but there are still
> a few details we need to get right. It's almost there, probably v3 and
> we'll be done. See my comments below.

;-)

>> The previous code used to evaluate the preprocessor token is_lock_free to
>> a variable before starting a transaction.  This behavior can cause an
>> error if another thread got the lock (without using a transaction)
>> between the evaluation of the token and the beginning of the transaction.
>> 
>> This patch delays the evaluation of is_lock_free to inside a transaction
>> by moving this part of the code to the macro ELIDE_LOCK.
>
> This is looking better, but the comments talk only about the error
> you are fixing, and not the overall concurrency scheme. The latter
> is really what we want to document, and from that documentation it
> should flow naturally that is_lock_free must, out of a requirement,
> can be read within the transaction safely.

Just to be clear: when you say "the comments talk", are you referring to the
commit message or source code comments?

>> +/* CONCURRENCY NOTES:
>> +
>> +   Always evaluate is_lock_free from inside a transaction in macros
>> +   ELIDE_LOCK and ELIDE_TRYLOCK.
>> +   Failing to do so, creates a race condition to the memory access specified
>> +   in is_lock_free which can be triggered with the following order of
>> +   events:
>> +   1. The lock accessed by is_lock_free is free.
>> +   2. Thread T1 evaluates is_lock_free and stores into register R1 that the
>> +      lock is free.
>> +   3. Thread T2 acquires the same lock used in is_lock_free.
>> +   4. T1 begins the transaction, creating a memory barrier where is_lock_free
>> +      is false, but R1 is true.
>> +   5. T1 reads R1 and doesn't abort the transaction.
>> +   6. T1 calls ELIDE_UNLOCK, which reads false from is_lock_free and decides
>> +      to unlock a lock acquired by T2, leading to undefined behavior.  */
>
> This talks about this particular problem, but should instead talk about
> why is_lock_free need not have any locks or atomic accesses.
>
> e.g.
>
> /* CONCURRENCY NOTES:
>
>    The value of is_lock_free is read and possibly written to by multiple
>    threads, either during lock acquisition, or elision. In order to avoid
>    this evaluation from becoming a data race the access of is_lock_free
>    is placed *inside* the transaction. Within the transaction is_lock_free
>    can be accessed with relaxed ordering. We are assured by the transaction
>    that the value of is_lock_free is consistent with the state of the lock,
>    otherwise the hardware will abort the transaction.
>
>    ... */

Agreed.  The source code becomes much more useful with your comments.  ;-)

> Feel free to add back the notes on the invalid case if you want, but the
> above is the minimum I'm expecting. It's a definition of the intent of the
> current code.

Maybe the invalid case should only be available in the commit message.

> Does that make sense?

Yes.

> Perhaps Torvald can review this too.
>
> I apologize in advance for making this take so long, but we all need
> to use the same language and calibrate our expectations regarding
> the level of detail that we need here and the language used.

No problem.  And thank you for spending your time on this review.

>> +
>> +/* Returns 0 if the lock defined by is_lock_free was elided.
>> +   ADAPT_COUNT is a per-lock state variable.  */
>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
>> +  ({									\
>> +    int ret = 0;							\
>> +    if (adapt_count > 0)						\
>> +      (adapt_count)--;							\
>> +    else								\
>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
>> +	{								\
>> +	  if (__builtin_tbegin (0))					\
>> +	    {								\
>> +	      if (is_lock_free)						\
>
> This should use a relaxed MO atomic access to highlight that this data *is*
> shared between threads, but that there are no ordering guarantees. It should
> result in a normal load, but it makes it clear in the code that some other
> thread is also accessing this data, and that the transaction as a barrier
> is all that is protecting this from being a data-race (undefined behaviour).
>
> Torvald made the same comment in his review, and I'm just repeating it here
> because I think it bears repeating.

Repeating it is a good idea because I was missing it.
But I still fail to understand how this will make the code clear.
In fact, I'm afraid it could make it more confusing, e.g. "if I have the
atomic access, why do I need this transaction?  Let's remove it."

Maybe, if I had an example I'd better understand what both of you want.

Thanks!
  
Torvald Riegel Aug. 21, 2015, 9:52 a.m. UTC | #4
On Thu, 2015-08-20 at 16:51 -0300, Adhemerval Zanella wrote:
> 
> 
> >> +
> >> +/* Returns 0 if the lock defined by is_lock_free was elided.
> >> +   ADAPT_COUNT is a per-lock state variable.  */
> >> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
> >> +  ({									\
> >> +    int ret = 0;							\
> >> +    if (adapt_count > 0)						\
> >> +      (adapt_count)--;							\
> >> +    else								\
> >> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
> >> +	{								\
> >> +	  if (__builtin_tbegin (0))					\
> >> +	    {								\
> >> +	      if (is_lock_free)						\
> > 
> > This should use a relaxed MO atomic access to highlight that this data *is*
> > shared between threads, but that there are no ordering guarantees. It should
> > result in a normal load, but it makes it clear in the code that some other
> > thread is also accessing this data, and that the transaction as a barrier
> > is all that is protecting this from being a data-race (undefined behaviour).
> 
> Should we really highlight it using atomic operations? The idea is exactly
> that any access within a transaction will not result in undefined behavior,
> so no need to atomic for synchronization.

I think it's better to highlight it.  If the data would only be accessed
from within transaction, the highlighting wouldn't be necessary.  But it
is mixed transactional/nontransactional synchronization in this case, so
we need the annotation.

This also makes moving to proper atomic types easier should we ever
decide to do that.
  
Carlos O'Donell Aug. 21, 2015, 3:22 p.m. UTC | #5
On 08/20/2015 03:51 PM, Adhemerval Zanella wrote:
> 
> 
> 
>>> +
>>> +/* Returns 0 if the lock defined by is_lock_free was elided.
>>> +   ADAPT_COUNT is a per-lock state variable.  */
>>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
>>> +  ({									\
>>> +    int ret = 0;							\
>>> +    if (adapt_count > 0)						\
>>> +      (adapt_count)--;							\
>>> +    else								\
>>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
>>> +	{								\
>>> +	  if (__builtin_tbegin (0))					\
>>> +	    {								\
>>> +	      if (is_lock_free)						\
>>
>> This should use a relaxed MO atomic access to highlight that this data *is*
>> shared between threads, but that there are no ordering guarantees. It should
>> result in a normal load, but it makes it clear in the code that some other
>> thread is also accessing this data, and that the transaction as a barrier
>> is all that is protecting this from being a data-race (undefined behaviour).
> 
> Should we really highlight it using atomic operations? The idea is exactly
> that any access within a transaction will not result in undefined behavior,
> so no need to atomic for synchronization.

I withdraw my suggestion.

I can't come up with a strong enough argument that isn't already solved by
a high quality comment about how the macro works.

Cheers,
Carlos.
  
Carlos O'Donell Aug. 21, 2015, 3:25 p.m. UTC | #6
On 08/20/2015 05:52 PM, Tulio Magno Quites Machado Filho wrote:
>> This is looking better, but the comments talk only about the error
>> you are fixing, and not the overall concurrency scheme. The latter
>> is really what we want to document, and from that documentation it
>> should flow naturally that is_lock_free must, out of a requirement,
>> can be read within the transaction safely.
> 
> Just to be clear: when you say "the comments talk", are you referring to the
> commit message or source code comments?

Source code comments.
 
>>> +/* CONCURRENCY NOTES:
>>> +
>>> +   Always evaluate is_lock_free from inside a transaction in macros
>>> +   ELIDE_LOCK and ELIDE_TRYLOCK.
>>> +   Failing to do so, creates a race condition to the memory access specified
>>> +   in is_lock_free which can be triggered with the following order of
>>> +   events:
>>> +   1. The lock accessed by is_lock_free is free.
>>> +   2. Thread T1 evaluates is_lock_free and stores into register R1 that the
>>> +      lock is free.
>>> +   3. Thread T2 acquires the same lock used in is_lock_free.
>>> +   4. T1 begins the transaction, creating a memory barrier where is_lock_free
>>> +      is false, but R1 is true.
>>> +   5. T1 reads R1 and doesn't abort the transaction.
>>> +   6. T1 calls ELIDE_UNLOCK, which reads false from is_lock_free and decides
>>> +      to unlock a lock acquired by T2, leading to undefined behavior.  */
>>
>> This talks about this particular problem, but should instead talk about
>> why is_lock_free need not have any locks or atomic accesses.
>>
>> e.g.
>>
>> /* CONCURRENCY NOTES:
>>
>>    The value of is_lock_free is read and possibly written to by multiple
>>    threads, either during lock acquisition, or elision. In order to avoid
>>    this evaluation from becoming a data race the access of is_lock_free
>>    is placed *inside* the transaction. Within the transaction is_lock_free
>>    can be accessed with relaxed ordering. We are assured by the transaction
>>    that the value of is_lock_free is consistent with the state of the lock,
>>    otherwise the hardware will abort the transaction.
>>
>>    ... */
> 
> Agreed.  The source code becomes much more useful with your comments.  ;-)

Thanks.

>> Feel free to add back the notes on the invalid case if you want, but the
>> above is the minimum I'm expecting. It's a definition of the intent of the
>> current code.
> 
> Maybe the invalid case should only be available in the commit message.

Very good idea. Yes, put them into the commit message.

> Repeating it is a good idea because I was missing it.
> But I still fail to understand how this will make the code clear.
> In fact, I'm afraid it could make it more confusing, e.g. "if I have the
> atomic access, why do I need this transaction?  Let's remove it."
> 
> Maybe, if I had an example I'd better understand what both of you want.

I withdraw my suggestion. As I mentioned to Adhemerval, I can't come up with
a strong reason that isn't already solved by a high quality comment about
concurrency.

Cheers,
Carlos.
  
Carlos O'Donell Aug. 25, 2015, 4:01 a.m. UTC | #7
On 08/21/2015 05:52 AM, Torvald Riegel wrote:
> On Thu, 2015-08-20 at 16:51 -0300, Adhemerval Zanella wrote:
>>
>>
>>>> +
>>>> +/* Returns 0 if the lock defined by is_lock_free was elided.
>>>> +   ADAPT_COUNT is a per-lock state variable.  */
>>>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
>>>> +  ({									\
>>>> +    int ret = 0;							\
>>>> +    if (adapt_count > 0)						\
>>>> +      (adapt_count)--;							\
>>>> +    else								\
>>>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
>>>> +	{								\
>>>> +	  if (__builtin_tbegin (0))					\
>>>> +	    {								\
>>>> +	      if (is_lock_free)						\
>>>
>>> This should use a relaxed MO atomic access to highlight that this data *is*
>>> shared between threads, but that there are no ordering guarantees. It should
>>> result in a normal load, but it makes it clear in the code that some other
>>> thread is also accessing this data, and that the transaction as a barrier
>>> is all that is protecting this from being a data-race (undefined behaviour).
>>
>> Should we really highlight it using atomic operations? The idea is exactly
>> that any access within a transaction will not result in undefined behavior,
>> so no need to atomic for synchronization.
> 
> I think it's better to highlight it.  If the data would only be accessed
> from within transaction, the highlighting wouldn't be necessary.  But it
> is mixed transactional/nontransactional synchronization in this case, so
> we need the annotation.

I'll let you discuss this with Tulio. Unless there is a correctness issue
or a violation of our rule to be data race free, then there should be some
flexibility granted to IBM to do what they wish, and particularly that which
is optimal. While the atomic_load_relaxed doesn't appear to do anything that
would incur a serious performance penality, it does add an asm with inputs
that force the value to a register and this may limit the compiler in some
future way.

From my perspective a code comment on concurrency seems sufficient?

If thread T1 and a thread T2 read or write memory that is used to evaluate
the value of is_lock_free, then the transaction would be aborted and thus
is_lock_free needs no atomic access?

> This also makes moving to proper atomic types easier should we ever
> decide to do that.

Could you please elaborate a bit more on this?

Cheers,
Carlos.
  
Torvald Riegel Aug. 25, 2015, 11:05 a.m. UTC | #8
On Tue, 2015-08-25 at 00:01 -0400, Carlos O'Donell wrote:
> On 08/21/2015 05:52 AM, Torvald Riegel wrote:
> > On Thu, 2015-08-20 at 16:51 -0300, Adhemerval Zanella wrote:
> >>
> >>
> >>>> +
> >>>> +/* Returns 0 if the lock defined by is_lock_free was elided.
> >>>> +   ADAPT_COUNT is a per-lock state variable.  */
> >>>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
> >>>> +  ({									\
> >>>> +    int ret = 0;							\
> >>>> +    if (adapt_count > 0)						\
> >>>> +      (adapt_count)--;							\
> >>>> +    else								\
> >>>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
> >>>> +	{								\
> >>>> +	  if (__builtin_tbegin (0))					\
> >>>> +	    {								\
> >>>> +	      if (is_lock_free)						\
> >>>
> >>> This should use a relaxed MO atomic access to highlight that this data *is*
> >>> shared between threads, but that there are no ordering guarantees. It should
> >>> result in a normal load, but it makes it clear in the code that some other
> >>> thread is also accessing this data, and that the transaction as a barrier
> >>> is all that is protecting this from being a data-race (undefined behaviour).
> >>
> >> Should we really highlight it using atomic operations? The idea is exactly
> >> that any access within a transaction will not result in undefined behavior,
> >> so no need to atomic for synchronization.
> > 
> > I think it's better to highlight it.  If the data would only be accessed
> > from within transaction, the highlighting wouldn't be necessary.  But it
> > is mixed transactional/nontransactional synchronization in this case, so
> > we need the annotation.
> 
> I'll let you discuss this with Tulio. Unless there is a correctness issue
> or a violation of our rule to be data race free, then there should be some
> flexibility granted to IBM to do what they wish, and particularly that which
> is optimal. While the atomic_load_relaxed doesn't appear to do anything that
> would incur a serious performance penality, it does add an asm with inputs
> that force the value to a register and this may limit the compiler in some
> future way.

Note that the atomic will be using a compiler built-in (eventually), so
the compiler isn't at a disadvantage.

> From my perspective a code comment on concurrency seems sufficient?
> 
> If thread T1 and a thread T2 read or write memory that is used to evaluate
> the value of is_lock_free, then the transaction would be aborted and thus
> is_lock_free needs no atomic access?
> 
> > This also makes moving to proper atomic types easier should we ever
> > decide to do that.
> 
> Could you please elaborate a bit more on this?

If we would be using C11 or C++11, then we would need to use an atomic
type for is_lock_free because we need atomic accesses outside of
transactions.  The atomic types do not offer a way to access the data
nonatomically (this is being worked on in C++ SG1, but for other
reasons).

Thus, if we'd use real atomic types at some point, then we would not
want people to access them nonatomically, accidentally.  (And we
wouldn't want to have seq-cst default accesses either.)
Therefore, to make this possible in the future, we should stick to the
rule that data accessed atomically somewhere is considered to be of
atomic type (even though we actually use nonatomic types in the
declaration), and all accesses to it are also using atomics.  IMO, this
is just cleaner and there's no reason to make an exception for the few
variables that we use in mixed nontxnal/txnal synchronization.  (In
contrast, using nonatomic accesses for initialization can provide
benefits such as using memset etc.)
  
Carlos O'Donell Aug. 25, 2015, 2:15 p.m. UTC | #9
On 08/25/2015 07:05 AM, Torvald Riegel wrote:
> On Tue, 2015-08-25 at 00:01 -0400, Carlos O'Donell wrote:
>> On 08/21/2015 05:52 AM, Torvald Riegel wrote:
>>> On Thu, 2015-08-20 at 16:51 -0300, Adhemerval Zanella wrote:
>>>>
>>>>
>>>>>> +
>>>>>> +/* Returns 0 if the lock defined by is_lock_free was elided.
>>>>>> +   ADAPT_COUNT is a per-lock state variable.  */
>>>>>> +# define ELIDE_LOCK(adapt_count, is_lock_free)				\
>>>>>> +  ({									\
>>>>>> +    int ret = 0;							\
>>>>>> +    if (adapt_count > 0)						\
>>>>>> +      (adapt_count)--;							\
>>>>>> +    else								\
>>>>>> +      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
>>>>>> +	{								\
>>>>>> +	  if (__builtin_tbegin (0))					\
>>>>>> +	    {								\
>>>>>> +	      if (is_lock_free)						\
>>>>>
>>>>> This should use a relaxed MO atomic access to highlight that this data *is*
>>>>> shared between threads, but that there are no ordering guarantees. It should
>>>>> result in a normal load, but it makes it clear in the code that some other
>>>>> thread is also accessing this data, and that the transaction as a barrier
>>>>> is all that is protecting this from being a data-race (undefined behaviour).
>>>>
>>>> Should we really highlight it using atomic operations? The idea is exactly
>>>> that any access within a transaction will not result in undefined behavior,
>>>> so no need to atomic for synchronization.
>>>
>>> I think it's better to highlight it.  If the data would only be accessed
>>> from within transaction, the highlighting wouldn't be necessary.  But it
>>> is mixed transactional/nontransactional synchronization in this case, so
>>> we need the annotation.
>>
>> I'll let you discuss this with Tulio. Unless there is a correctness issue
>> or a violation of our rule to be data race free, then there should be some
>> flexibility granted to IBM to do what they wish, and particularly that which
>> is optimal. While the atomic_load_relaxed doesn't appear to do anything that
>> would incur a serious performance penality, it does add an asm with inputs
>> that force the value to a register and this may limit the compiler in some
>> future way.
> 
> Note that the atomic will be using a compiler built-in (eventually), so
> the compiler isn't at a disadvantage.

Good point.

>> From my perspective a code comment on concurrency seems sufficient?
>>
>> If thread T1 and a thread T2 read or write memory that is used to evaluate
>> the value of is_lock_free, then the transaction would be aborted and thus
>> is_lock_free needs no atomic access?
>>
>>> This also makes moving to proper atomic types easier should we ever
>>> decide to do that.
>>
>> Could you please elaborate a bit more on this?
> 
> If we would be using C11 or C++11, then we would need to use an atomic
> type for is_lock_free because we need atomic accesses outside of
> transactions.  The atomic types do not offer a way to access the data
> nonatomically (this is being worked on in C++ SG1, but for other
> reasons).

As a concrete example the structure element that is accessed atomically is
rwlock->__data.__lock. We access it atomically in lll_lock and also
in the txnal region (is region the right word?).

The access is part of:

nptl/pthread_rwlock_wrlock.c 

100   if (ELIDE_LOCK (rwlock->__data.__rwelision,
101                   rwlock->__data.__lock == 0
102                   && rwlock->__data.__writer == 0
103                   && rwlock->__data.__nr_readers == 0))
104     return 0;

With the intent being for the expression to be evaluted inside of the
transaction and thus abort if another thread has touched any of those
fields.

That entire expression expands into the usage of is_lock_free. Therefore
shouldn't the change be at the caller site?

e.g.

100   if (ELIDE_LOCK (rwlock->__data.__rwelision,
101                   atomic_load_relaxed (rwlock->__data.__lock) == 0
102                   && rwlock->__data.__writer == 0
103                   && rwlock->__data.__nr_readers == 0))
104     return 0;

Since the powerpc elision backend doesn't know anything about the types
that went into the evaluation of is_lock_free?

If anything I think *we* have the onus to fix these cases of missing
atomic_load_relaxed not IBM?

Thoughts?

> Thus, if we'd use real atomic types at some point, then we would not
> want people to access them nonatomically, accidentally.  (And we
> wouldn't want to have seq-cst default accesses either.)
> Therefore, to make this possible in the future, we should stick to the
> rule that data accessed atomically somewhere is considered to be of
> atomic type (even though we actually use nonatomic types in the
> declaration), and all accesses to it are also using atomics.  IMO, this
> is just cleaner and there's no reason to make an exception for the few
> variables that we use in mixed nontxnal/txnal synchronization.  (In
> contrast, using nonatomic accesses for initialization can provide
> benefits such as using memset etc.)

I can agree with that.

c.
  
Torvald Riegel Aug. 25, 2015, 4:37 p.m. UTC | #10
On Tue, 2015-08-25 at 10:15 -0400, Carlos O'Donell wrote:
> As a concrete example the structure element that is accessed atomically is
> rwlock->__data.__lock. We access it atomically in lll_lock and also
> in the txnal region (is region the right word?).

Txnal region is used by some, so I think this works.  Just transaction
would work as well I think.

> The access is part of:
> 
> nptl/pthread_rwlock_wrlock.c 
> 
> 100   if (ELIDE_LOCK (rwlock->__data.__rwelision,
> 101                   rwlock->__data.__lock == 0
> 102                   && rwlock->__data.__writer == 0
> 103                   && rwlock->__data.__nr_readers == 0))
> 104     return 0;
> 
> With the intent being for the expression to be evaluted inside of the
> transaction and thus abort if another thread has touched any of those
> fields.
> 
> That entire expression expands into the usage of is_lock_free. Therefore
> shouldn't the change be at the caller site?

That would be an option, or we should pass in a function or something.

> e.g.
> 
> 100   if (ELIDE_LOCK (rwlock->__data.__rwelision,
> 101                   atomic_load_relaxed (rwlock->__data.__lock) == 0
> 102                   && rwlock->__data.__writer == 0
> 103                   && rwlock->__data.__nr_readers == 0))
> 104     return 0;
> 
> Since the powerpc elision backend doesn't know anything about the types
> that went into the evaluation of is_lock_free?
> 
> If anything I think *we* have the onus to fix these cases of missing
> atomic_load_relaxed not IBM?

Somebody should do it :)  I hadn't thought too much about for whom it
would be most convenient to do.  As I wrote in the review for Tulio's
patch, IMO passing in an expression into a macro that has to be
evaluated in there is pretty ugly and seems to be at least related to
the bug Tulio is fixing.  Maybe we can pass in a function that is
inlined by the compiler.
I do agree that IBM just used what the Intel implementation did, and
thus didn't create that in the first place.
  

Patch

diff --git a/NEWS b/NEWS
index b75a202..49aba2d 100644
--- a/NEWS
+++ b/NEWS
@@ -11,7 +11,8 @@  Version 2.23
 
   14341, 16517, 16519, 16520, 16734, 16973, 17787, 17905, 18084, 18086,
   18265, 18370, 18421, 18480, 18525, 18618, 18647, 18661, 18674, 18681,
-  18778, 18781, 18787, 18789, 18790, 18795, 18796, 18820, 18823, 18824.
+  18743, 18778, 18781, 18787, 18789, 18790, 18795, 18796, 18820, 18823,
+  18824.
 
 * The obsolete header <regexp.h> has been removed.  Programs that require
   this header must be updated to use <regex.h> instead.
diff --git a/sysdeps/powerpc/nptl/elide.h b/sysdeps/powerpc/nptl/elide.h
index 389f5a5..261efd5 100644
--- a/sysdeps/powerpc/nptl/elide.h
+++ b/sysdeps/powerpc/nptl/elide.h
@@ -23,67 +23,83 @@ 
 # include <htm.h>
 # include <elision-conf.h>
 
-/* Returns true if the lock defined by is_lock_free as elided.
-   ADAPT_COUNT is a pointer to per-lock state variable. */
-
+/* Get the new value of adapt_count according to the elision
+   configurations.  Returns true if the system should retry again or false
+   otherwise.  */
 static inline bool
-__elide_lock (uint8_t *adapt_count, int is_lock_free)
+__get_new_count (uint8_t *adapt_count)
 {
-  if (*adapt_count > 0)
+  /* A persistent failure indicates that a retry will probably
+     result in another failure.  Use normal locking now and
+     for the next couple of calls.  */
+  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
     {
-      (*adapt_count)--;
+      if (__elision_aconf.skip_lock_internal_abort > 0)
+	*adapt_count = __elision_aconf.skip_lock_internal_abort;
       return false;
     }
-
-  for (int i = __elision_aconf.try_tbegin; i > 0; i--)
-    {
-      if (__builtin_tbegin (0))
-	{
-	  if (is_lock_free)
-	    return true;
-	  /* Lock was busy.  */
-	  __builtin_tabort (_ABORT_LOCK_BUSY);
-	}
-      else
-	{
-	  /* A persistent failure indicates that a retry will probably
-	     result in another failure.  Use normal locking now and
-	     for the next couple of calls.  */
-	  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
-	    {
-	      if (__elision_aconf.skip_lock_internal_abort > 0)
-		*adapt_count = __elision_aconf.skip_lock_internal_abort;
-	      break;
-	    }
-	  /* Same logic as above, but for a number of temporary failures in a
-	     a row.  */
-	  else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0
-		   && __elision_aconf.try_tbegin > 0)
-	    *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries;
-	}
-     }
-
-  return false;
+  /* Same logic as above, but for a number of temporary failures in a
+     row.  */
+  else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0
+	   && __elision_aconf.try_tbegin > 0)
+    *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries;
+  return true;
 }
 
-# define ELIDE_LOCK(adapt_count, is_lock_free) \
-  __elide_lock (&(adapt_count), is_lock_free)
-
-
-static inline bool
-__elide_trylock (uint8_t *adapt_count, int is_lock_free, int write)
-{
-  if (__elision_aconf.try_tbegin > 0)
-    {
-      if (write)
-	__builtin_tabort (_ABORT_NESTED_TRYLOCK);
-      return __elide_lock (adapt_count, is_lock_free);
-    }
-  return false;
-}
+/* CONCURRENCY NOTES:
+
+   Always evaluate is_lock_free from inside a transaction in macros
+   ELIDE_LOCK and ELIDE_TRYLOCK.
+   Failing to do so creates a race condition on the memory access specified
+   in is_lock_free, which can be triggered with the following order of
+   events:
+   1. The lock accessed by is_lock_free is free.
+   2. Thread T1 evaluates is_lock_free and stores into register R1 that the
+      lock is free.
+   3. Thread T2 acquires the same lock used in is_lock_free.
+   4. T1 begins the transaction, creating a memory barrier where is_lock_free
+      is false, but R1 is true.
+   5. T1 reads R1 and doesn't abort the transaction.
+   6. T1 calls ELIDE_UNLOCK, which reads false from is_lock_free and decides
+      to unlock a lock acquired by T2, leading to undefined behavior.  */
+
+/* Returns 1 if the lock defined by is_lock_free was elided.
+   ADAPT_COUNT is a per-lock state variable.  */
+# define ELIDE_LOCK(adapt_count, is_lock_free)				\
+  ({									\
+    int ret = 0;							\
+    if (adapt_count > 0)						\
+      (adapt_count)--;							\
+    else								\
+      for (int i = __elision_aconf.try_tbegin; i > 0; i--)		\
+	{								\
+	  if (__builtin_tbegin (0))					\
+	    {								\
+	      if (is_lock_free)						\
+		{							\
+		  ret = 1;						\
+		  break;						\
+		}							\
+	      __builtin_tabort (_ABORT_LOCK_BUSY);			\
+	    }								\
+	  else								\
+	    if (!__get_new_count (&adapt_count))			\
+	      break;							\
+	}								\
+    ret;								\
+  })
 
 # define ELIDE_TRYLOCK(adapt_count, is_lock_free, write)	\
-  __elide_trylock (&(adapt_count), is_lock_free, write)
+  ({								\
+    int ret = 0;						\
+    if (__elision_aconf.try_tbegin > 0)				\
+      {								\
+	if (write)						\
+	  __builtin_tabort (_ABORT_NESTED_TRYLOCK);		\
+	ret = ELIDE_LOCK (adapt_count, is_lock_free);		\
+      }								\
+    ret;							\
+  })
 
 
 static inline bool