[BZ,#21778] Fix oversight in robust mutex lock acquisition.

Message ID 1501018343.30433.50.camel@redhat.com
State New, archived
Headers

Commit Message

Torvald Riegel July 25, 2017, 9:32 p.m. UTC
  65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
introduced BZ 21778: if the CAS used to try to acquire a lock fails, the
expected value is not updated, which breaks other cases in the lock
acquisition loop.  The fix is to simply update the expected value with
the value returned by the CAS, which ensures that behavior is as if the
first case with the CAS never happened (if the CAS fails).

This is a regression introduced in the last release, so it would be good
to get this included in this release.  I'll likely be AFK on Thursday,
so please just commit this once it has been approved.  Tested on
x86_64-linux.


	[BZ 21778]
	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
	oldval if the CAS fails.
	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
	(tf, do_test): Use them.
	* nptl/tst-mutex7robust.c: New file.
	* nptl/Makefile (tests): Add new test.
  

Comments

Siddhesh Poyarekar July 28, 2017, 12:41 p.m. UTC | #1
On Wednesday 26 July 2017 03:02 AM, Torvald Riegel wrote:
> 65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
> introduced BZ 21778: if the CAS used to try to acquire a lock fails, the
> expected value is not updated, which breaks other cases in the lock
> acquisition loop.  The fix is to simply update the expected value with
> the value returned by the CAS, which ensures that behavior is as if the
> first case with the CAS never happened (if the CAS fails).
> 
> This is a regression introduced in the last release, so it would be good
> to get this included in this release.  I'll likely be AFK on Thursday,
> so please just commit this once it has been approved.  Tested on
> x86_64-linux.
> 
> 
> 	[BZ 21778]
> 	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
> 	oldval if the CAS fails.
> 	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
> 	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
> 	(tf, do_test): Use them.
> 	* nptl/tst-mutex7robust.c: New file.
> 	* nptl/Makefile (tests): Add new test.
> 

Looks good to me, but I think Carlos is also reviewing this, so please
wait for his confirmation before you commit.

Siddhesh

> 
> robust-mutex-21778.patch
> 
> 
> commit 77e7d626fa6d7f8800c9658e65e43d13c5cdc81d
> Author: Torvald Riegel <triegel@redhat.com>
> Date:   Mon Jul 24 22:53:38 2017 +0200
> 
>     Fix oversight in robust mutex lock acquisition.
>     
>     65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
>     introduced BZ 21778: if the CAS used to try to acquire a lock fails,
>     the expected value is not updated, which breaks other cases in the lock
>     acquisition loop.  The fix is to simply update the expected value with
>     the value returned by the CAS, which ensures that behavior is as if the
>     first case with the CAS never happened.
>     
>     2017-07-25  Torvald Riegel  <triegel@redhat.com>
>     
>     	[BZ 21778]
>     	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
>     	oldval if the CAS fails.
>     	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
>     	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
>     	(tf, do_test): Use them.
>     	* nptl/tst-mutex7robust.c: New file.
>     	* nptl/Makefile (tests): Add new test.
> 
> diff --git a/nptl/Makefile b/nptl/Makefile
> index dd01994..bca09bf 100644
> --- a/nptl/Makefile
> +++ b/nptl/Makefile
> @@ -230,7 +230,7 @@ LDLIBS-tst-thread_local1 = -lstdc++
>  
>  tests = tst-attr1 tst-attr2 tst-attr3 tst-default-attr \
>  	tst-mutex1 tst-mutex2 tst-mutex3 tst-mutex4 tst-mutex5 tst-mutex6 \
> -	tst-mutex7 tst-mutex9 tst-mutex5a tst-mutex7a \
> +	tst-mutex7 tst-mutex9 tst-mutex5a tst-mutex7a tst-mutex7robust \
>  	tst-mutexpi1 tst-mutexpi2 tst-mutexpi3 tst-mutexpi4 tst-mutexpi5 \
>  	tst-mutexpi5a tst-mutexpi6 tst-mutexpi7 tst-mutexpi7a \
>  	tst-mutexpi9 \
> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> index b76475b..8c48503 100644
> --- a/nptl/pthread_mutex_lock.c
> +++ b/nptl/pthread_mutex_lock.c
> @@ -197,11 +197,14 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
>  	{
>  	  /* Try to acquire the lock through a CAS from 0 (not acquired) to
>  	     our TID | assume_other_futex_waiters.  */
> -	  if (__glibc_likely ((oldval == 0)
> -			      && (atomic_compare_and_exchange_bool_acq
> -				  (&mutex->__data.__lock,
> -				   id | assume_other_futex_waiters, 0) == 0)))
> -	      break;
> +	  if (__glibc_likely (oldval == 0))
> +	    {
> +	      oldval
> +	        = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
> +	            id | assume_other_futex_waiters, 0);
> +	      if (__glibc_likely (oldval == 0))
> +		break;
> +	    }
>  
>  	  if ((oldval & FUTEX_OWNER_DIED) != 0)
>  	    {
> diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
> index be53381..d5ec314 100644
> --- a/nptl/pthread_mutex_timedlock.c
> +++ b/nptl/pthread_mutex_timedlock.c
> @@ -154,11 +154,14 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
>  	{
>  	  /* Try to acquire the lock through a CAS from 0 (not acquired) to
>  	     our TID | assume_other_futex_waiters.  */
> -	  if (__glibc_likely ((oldval == 0)
> -			      && (atomic_compare_and_exchange_bool_acq
> -				  (&mutex->__data.__lock,
> -				   id | assume_other_futex_waiters, 0) == 0)))
> -	      break;
> +	  if (__glibc_likely (oldval == 0))
> +	    {
> +	      oldval
> +	        = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
> +	            id | assume_other_futex_waiters, 0);
> +	      if (__glibc_likely (oldval == 0))
> +		break;
> +	    }
>  
>  	  if ((oldval & FUTEX_OWNER_DIED) != 0)
>  	    {
> diff --git a/nptl/tst-mutex7.c b/nptl/tst-mutex7.c
> index a11afdb..7990409 100644
> --- a/nptl/tst-mutex7.c
> +++ b/nptl/tst-mutex7.c
> @@ -26,13 +26,22 @@
>  #ifndef TYPE
>  # define TYPE PTHREAD_MUTEX_DEFAULT
>  #endif
> -
> +#ifndef ROBUST
> +# define ROBUST PTHREAD_MUTEX_STALLED
> +#endif
> +#ifndef DELAY_NSEC
> +# define DELAY_NSEC 11000
> +#endif
> +#ifndef ROUNDS
> +# define ROUNDS 1000
> +#endif
> +#ifndef N
> +# define N 100
> +#endif
>  
>  static pthread_mutex_t lock;
>  
>  
> -#define ROUNDS 1000
> -#define N 100
>  
>  
>  static void *
> @@ -40,7 +49,7 @@ tf (void *arg)
>  {
>    int nr = (long int) arg;
>    int cnt;
> -  struct timespec ts = { .tv_sec = 0, .tv_nsec = 11000 };
> +  struct timespec ts = { .tv_sec = 0, .tv_nsec = DELAY_NSEC };
>  
>    for (cnt = 0; cnt < ROUNDS; ++cnt)
>      {
> @@ -56,7 +65,8 @@ tf (void *arg)
>  	  return (void *) 1l;
>  	}
>  
> -      nanosleep (&ts, NULL);
> +      if ((ts.tv_sec > 0) || (ts.tv_nsec > 0))
> +	nanosleep (&ts, NULL);
>      }
>  
>    return NULL;
> @@ -80,6 +90,12 @@ do_test (void)
>        exit (1);
>      }
>  
> +  if (pthread_mutexattr_setrobust (&a, ROBUST) != 0)
> +    {
> +      puts ("mutexattr_setrobust failed");
> +      exit (1);
> +    }
> +
>  #ifdef ENABLE_PI
>    if (pthread_mutexattr_setprotocol (&a, PTHREAD_PRIO_INHERIT) != 0)
>      {
> diff --git a/nptl/tst-mutex7robust.c b/nptl/tst-mutex7robust.c
> new file mode 100644
> index 0000000..73eb66d
> --- /dev/null
> +++ b/nptl/tst-mutex7robust.c
> @@ -0,0 +1,6 @@
> +#define TYPE PTHREAD_MUTEX_NORMAL
> +#define ROBUST PTHREAD_MUTEX_ROBUST
> +#define DELAY_NSEC 0
> +#define ROUNDS 1000
> +#define N 32
> +#include "tst-mutex7.c"
>
  
Carlos O'Donell July 28, 2017, 10:47 p.m. UTC | #2
On 07/28/2017 08:41 AM, Siddhesh Poyarekar wrote:
> On Wednesday 26 July 2017 03:02 AM, Torvald Riegel wrote:
>> 65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
>> introduced BZ 21778: if the CAS used to try to acquire a lock fails, the
>> expected value is not updated, which breaks other cases in the lock
>> acquisition loop.  The fix is to simply update the expected value with
>> the value returned by the CAS, which ensures that behavior is as if the
>> first case with the CAS never happened (if the CAS fails).
>>
>> This is a regression introduced in the last release, so it would be good
>> to get this included in this release.  I'll likely be AFK on Thursday,
>> so please just commit this once it has been approved.  Tested on
>> x86_64-linux.
>>
>>
>> 	[BZ 21778]
>> 	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
>> 	oldval if the CAS fails.
>> 	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
>> 	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
>> 	(tf, do_test): Use them.
>> 	* nptl/tst-mutex7robust.c: New file.
>> 	* nptl/Makefile (tests): Add new test.
>>
> 
> Looks good to me, but I think Carlos is also reviewing this, so please
> wait for his confirmation before you commit.

So far I've finished confirming this fix on:
- x86_64
- i686
- ppc64

With the following targets still lagging:
- s390x
- aarch64
- ppc64le
- armv7hl

I'd like to see those 4 last targets complete their test cycle before
I commit this change.

I've got a v2 which basically just adds some more comments to the testsuite.

The bug is 100% reproducible and easily with the small critical section and
the lock/unlock sequence of 100 threads, we hit the CAS very quickly, fail
to set oldval and everything deadlocks. It's always easier to validate when
the failure is in a high contention path, because creating contention is easy :-)

Would you be opposed if I commit this slightly *after* the midnight UTC hard
freeze? That way I can confirm the other four arches are OK? I expect that with
the coverage I have here it won't break anyone else (and the patch is clearly
correct, but I would like to really double check).

Cheers,
Carlos.
  
Carlos O'Donell July 29, 2017, 4:06 a.m. UTC | #3
On 07/28/2017 06:47 PM, Carlos O'Donell wrote:
> On 07/28/2017 08:41 AM, Siddhesh Poyarekar wrote:
>> On Wednesday 26 July 2017 03:02 AM, Torvald Riegel wrote:
>>> 65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
>>> introduced BZ 21778: if the CAS used to try to acquire a lock fails, the
>>> expected value is not updated, which breaks other cases in the lock
>>> acquisition loop.  The fix is to simply update the expected value with
>>> the value returned by the CAS, which ensures that behavior is as if the
>>> first case with the CAS never happened (if the CAS fails).
>>>
>>> This is a regression introduced in the last release, so it would be good
>>> to get this included in this release.  I'll likely be AFK on Thursday,
>>> so please just commit this once it has been approved.  Tested on
>>> x86_64-linux.
>>>
>>>
>>> 	[BZ 21778]
>>> 	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
>>> 	oldval if the CAS fails.
>>> 	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
>>> 	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
>>> 	(tf, do_test): Use them.
>>> 	* nptl/tst-mutex7robust.c: New file.
>>> 	* nptl/Makefile (tests): Add new test.
>>>
>>
>> Looks good to me, but I think Carlos is also reviewing this, so please
>> wait for his confirmation before you commit.
> 
> So far I've finished confirming this fix on:
> - x86_64
> - i686
> - ppc64
> 
> With the following targets still lagging:
> - s390x
> - aarch64
> - ppc64le
> - armv7hl

Done. All 7 machines passed the aggressive regression test and validation.
I was able to reproduce the issue on all of the machines and fix it with
the patch.

Bug 21778 is now fixed.

Cheers,
Carlos.
  
Siddhesh Poyarekar July 29, 2017, 4:28 a.m. UTC | #4
On Saturday 29 July 2017 04:17 AM, Carlos O'Donell wrote:
> Would you be opposed if I commit this slightly *after* the midnight UTC hard
> freeze? That way I can confirm the other four arches are OK? I expect that with
> the coverage I have here it won't break anyone else (and the patch is clearly
> correct, but I would like to really double check).

Sure, this one is already agreed upon.

Siddhesh
  
Carlos O'Donell July 29, 2017, 4:35 a.m. UTC | #5
On 07/29/2017 12:28 AM, Siddhesh Poyarekar wrote:
> On Saturday 29 July 2017 04:17 AM, Carlos O'Donell wrote:
>> Would you be opposed if I commit this slightly *after* the midnight UTC hard
>> freeze? That way I can confirm the other four arches are OK? I expect that with
>> the coverage I have here it won't break anyone else (and the patch is clearly
>> correct, but I would like to really double check).
> 
> Sure, this one is already agreed upon.

Thanks. I committed without your explicit approval, expecting that your review,
and my testing were near enough to the freeze to be OK.

I promise not to fix anything else ;-)

My eyes are tired from code review and multi-machine testing :}

Cheers,
Carlos.
  

Patch

commit 77e7d626fa6d7f8800c9658e65e43d13c5cdc81d
Author: Torvald Riegel <triegel@redhat.com>
Date:   Mon Jul 24 22:53:38 2017 +0200

    Fix oversight in robust mutex lock acquisition.
    
    65810f0ef05e8c9e333f17a44e77808b163ca298 fixed a robust mutex bug but
    introduced BZ 21778: if the CAS used to try to acquire a lock fails,
    the expected value is not updated, which breaks other cases in the lock
    acquisition loop.  The fix is to simply update the expected value with
    the value returned by the CAS, which ensures that behavior is as if the
    first case with the CAS never happened.
    
    2017-07-25  Torvald Riegel  <triegel@redhat.com>
    
    	[BZ 21778]
    	* nptl/pthread_mutex_timedlock.c (__pthread_mutex_timedlock): Update
    	oldval if the CAS fails.
    	* nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
    	* nptl/tst-mutex7.c (ROBUST, DELAY_NSEC, ROUNDS, N): New.
    	(tf, do_test): Use them.
    	* nptl/tst-mutex7robust.c: New file.
    	* nptl/Makefile (tests): Add new test.

diff --git a/nptl/Makefile b/nptl/Makefile
index dd01994..bca09bf 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -230,7 +230,7 @@  LDLIBS-tst-thread_local1 = -lstdc++
 
 tests = tst-attr1 tst-attr2 tst-attr3 tst-default-attr \
 	tst-mutex1 tst-mutex2 tst-mutex3 tst-mutex4 tst-mutex5 tst-mutex6 \
-	tst-mutex7 tst-mutex9 tst-mutex5a tst-mutex7a \
+	tst-mutex7 tst-mutex9 tst-mutex5a tst-mutex7a tst-mutex7robust \
 	tst-mutexpi1 tst-mutexpi2 tst-mutexpi3 tst-mutexpi4 tst-mutexpi5 \
 	tst-mutexpi5a tst-mutexpi6 tst-mutexpi7 tst-mutexpi7a \
 	tst-mutexpi9 \
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index b76475b..8c48503 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -197,11 +197,14 @@  __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 	{
 	  /* Try to acquire the lock through a CAS from 0 (not acquired) to
 	     our TID | assume_other_futex_waiters.  */
-	  if (__glibc_likely ((oldval == 0)
-			      && (atomic_compare_and_exchange_bool_acq
-				  (&mutex->__data.__lock,
-				   id | assume_other_futex_waiters, 0) == 0)))
-	      break;
+	  if (__glibc_likely (oldval == 0))
+	    {
+	      oldval
+	        = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+	            id | assume_other_futex_waiters, 0);
+	      if (__glibc_likely (oldval == 0))
+		break;
+	    }
 
 	  if ((oldval & FUTEX_OWNER_DIED) != 0)
 	    {
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
index be53381..d5ec314 100644
--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
@@ -154,11 +154,14 @@  __pthread_mutex_timedlock (pthread_mutex_t *mutex,
 	{
 	  /* Try to acquire the lock through a CAS from 0 (not acquired) to
 	     our TID | assume_other_futex_waiters.  */
-	  if (__glibc_likely ((oldval == 0)
-			      && (atomic_compare_and_exchange_bool_acq
-				  (&mutex->__data.__lock,
-				   id | assume_other_futex_waiters, 0) == 0)))
-	      break;
+	  if (__glibc_likely (oldval == 0))
+	    {
+	      oldval
+	        = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+	            id | assume_other_futex_waiters, 0);
+	      if (__glibc_likely (oldval == 0))
+		break;
+	    }
 
 	  if ((oldval & FUTEX_OWNER_DIED) != 0)
 	    {
diff --git a/nptl/tst-mutex7.c b/nptl/tst-mutex7.c
index a11afdb..7990409 100644
--- a/nptl/tst-mutex7.c
+++ b/nptl/tst-mutex7.c
@@ -26,13 +26,22 @@ 
 #ifndef TYPE
 # define TYPE PTHREAD_MUTEX_DEFAULT
 #endif
-
+#ifndef ROBUST
+# define ROBUST PTHREAD_MUTEX_STALLED
+#endif
+#ifndef DELAY_NSEC
+# define DELAY_NSEC 11000
+#endif
+#ifndef ROUNDS
+# define ROUNDS 1000
+#endif
+#ifndef N
+# define N 100
+#endif
 
 static pthread_mutex_t lock;
 
 
-#define ROUNDS 1000
-#define N 100
 
 
 static void *
@@ -40,7 +49,7 @@  tf (void *arg)
 {
   int nr = (long int) arg;
   int cnt;
-  struct timespec ts = { .tv_sec = 0, .tv_nsec = 11000 };
+  struct timespec ts = { .tv_sec = 0, .tv_nsec = DELAY_NSEC };
 
   for (cnt = 0; cnt < ROUNDS; ++cnt)
     {
@@ -56,7 +65,8 @@  tf (void *arg)
 	  return (void *) 1l;
 	}
 
-      nanosleep (&ts, NULL);
+      if ((ts.tv_sec > 0) || (ts.tv_nsec > 0))
+	nanosleep (&ts, NULL);
     }
 
   return NULL;
@@ -80,6 +90,12 @@  do_test (void)
       exit (1);
     }
 
+  if (pthread_mutexattr_setrobust (&a, ROBUST) != 0)
+    {
+      puts ("mutexattr_setrobust failed");
+      exit (1);
+    }
+
 #ifdef ENABLE_PI
   if (pthread_mutexattr_setprotocol (&a, PTHREAD_PRIO_INHERIT) != 0)
     {
diff --git a/nptl/tst-mutex7robust.c b/nptl/tst-mutex7robust.c
new file mode 100644
index 0000000..73eb66d
--- /dev/null
+++ b/nptl/tst-mutex7robust.c
@@ -0,0 +1,6 @@ 
+#define TYPE PTHREAD_MUTEX_NORMAL
+#define ROBUST PTHREAD_MUTEX_ROBUST
+#define DELAY_NSEC 0
+#define ROUNDS 1000
+#define N 32
+#include "tst-mutex7.c"