diff mbox series

[1/3] nptl: Extract <bits/atomic_wide_counter.h> from pthread_cond_common.c

Message ID d02a4b7aff0e9bec7abae5e15d9ae3ab31f025ba.1635954168.git.fweimer@redhat.com
State Committed
Commit 8bd336a00a5311bf7a9e99b3b0e9f01ff5faa74b
Headers show
Series Add _dl_find_eh_frame function for unwinder optimization | expand

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Florian Weimer Nov. 3, 2021, 4:27 p.m. UTC
And make it an installed header.  This addresses a few aliasing
violations (which do not seem to result in miscompilation due to
the use of atomics), and also enables use of wide counters in other
parts of the library.

The debug output in nptl/tst-cond22 has been adjusted to print
the 32-bit values instead because it avoids a big-endian/little-endian
difference.
---
 bits/atomic_wide_counter.h              |  35 ++++
 include/atomic_wide_counter.h           |  89 +++++++++++
 include/bits/atomic_wide_counter.h      |   1 +
 misc/Makefile                           |   3 +-
 misc/atomic_wide_counter.c              | 127 +++++++++++++++
 nptl/Makefile                           |  13 +-
 nptl/pthread_cond_common.c              | 204 ++++--------------------
 nptl/tst-cond22.c                       |  14 +-
 sysdeps/nptl/bits/thread-shared-types.h |  22 +--
 9 files changed, 310 insertions(+), 198 deletions(-)
 create mode 100644 bits/atomic_wide_counter.h
 create mode 100644 include/atomic_wide_counter.h
 create mode 100644 include/bits/atomic_wide_counter.h
 create mode 100644 misc/atomic_wide_counter.c

Comments

Adhemerval Zanella Nov. 15, 2021, 7:24 p.m. UTC | #1
On 03/11/2021 13:27, Florian Weimer via Libc-alpha wrote:
> And make it an installed header.  This addresses a few aliasing
> violations (which do not seem to result in miscompilation due to
> the use of atomics), and also enables use of wide counters in other
> parts of the library.
> 
> The debug output in nptl/tst-cond22 has been adjusted to print
> the 32-bit values instead because it avoids a big-endian/little-endian
> difference.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  bits/atomic_wide_counter.h              |  35 ++++
>  include/atomic_wide_counter.h           |  89 +++++++++++
>  include/bits/atomic_wide_counter.h      |   1 +
>  misc/Makefile                           |   3 +-
>  misc/atomic_wide_counter.c              | 127 +++++++++++++++
>  nptl/Makefile                           |  13 +-
>  nptl/pthread_cond_common.c              | 204 ++++--------------------
>  nptl/tst-cond22.c                       |  14 +-
>  sysdeps/nptl/bits/thread-shared-types.h |  22 +--
>  9 files changed, 310 insertions(+), 198 deletions(-)
>  create mode 100644 bits/atomic_wide_counter.h
>  create mode 100644 include/atomic_wide_counter.h
>  create mode 100644 include/bits/atomic_wide_counter.h
>  create mode 100644 misc/atomic_wide_counter.c
> 
> diff --git a/bits/atomic_wide_counter.h b/bits/atomic_wide_counter.h
> new file mode 100644
> index 0000000000..0687eb554e
> --- /dev/null
> +++ b/bits/atomic_wide_counter.h
> @@ -0,0 +1,35 @@
> +/* Monotonically increasing wide counters (at least 62 bits).
> +   Copyright (C) 2016-2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _BITS_ATOMIC_WIDE_COUNTER_H
> +#define _BITS_ATOMIC_WIDE_COUNTER_H
> +
> +/* Counter that is monotonically increasing (by less than 2**31 per
> +   increment), with a single writer, and an arbitrary number of
> +   readers.  */
> +typedef union
> +{
> +  __extension__ unsigned long long int __value64;
> +  struct
> +  {
> +    unsigned int __low;
> +    unsigned int __high;
> +  } __value32;
> +} __atomic_wide_counter;
> +
> +#endif /* _BITS_ATOMIC_WIDE_COUNTER_H */

Ok, it would be included in multiple places so we can't tie to a specific
header.

> diff --git a/include/atomic_wide_counter.h b/include/atomic_wide_counter.h
> new file mode 100644
> index 0000000000..31f009d5e6
> --- /dev/null
> +++ b/include/atomic_wide_counter.h
> @@ -0,0 +1,89 @@
> +/* Monotonically increasing wide counters (at least 62 bits).
> +   Copyright (C) 2016-2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _ATOMIC_WIDE_COUNTER_H
> +#define _ATOMIC_WIDE_COUNTER_H
> +
> +#include <atomic.h>
> +#include <bits/atomic_wide_counter.h>
> +
> +#if __HAVE_64B_ATOMICS
> +
> +static inline uint64_t
> +__atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
> +{
> +  return atomic_load_relaxed (&c->__value64);
> +}
> +
> +static inline uint64_t
> +__atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
> +                                         unsigned int val)
> +{
> +  return atomic_fetch_add_relaxed (&c->__value64, val);
> +}
> +
> +static inline uint64_t
> +__atomic_wide_counter_fetch_add_acquire (__atomic_wide_counter *c,
> +                                         unsigned int val)
> +{
> +  return atomic_fetch_add_acquire (&c->__value64, val);
> +}
> +
> +static inline void
> +__atomic_wide_counter_add_relaxed (__atomic_wide_counter *c,
> +                                   unsigned int val)
> +{
> +  atomic_store_relaxed (&c->__value64,
> +                        atomic_load_relaxed (&c->__value64) + val);
> +}
> +
> +static uint64_t __attribute__ ((unused))
> +__atomic_wide_counter_fetch_xor_release (__atomic_wide_counter *c,
> +                                         unsigned int val)
> +{
> +  return atomic_fetch_xor_release (&c->__value64, val);
> +}
> +
> +#else /* !__HAVE_64B_ATOMICS */
> +
> +uint64_t __atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
> +  attribute_hidden;
> +
> +uint64_t __atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
> +                                                  unsigned int op)
> +  attribute_hidden;
> +
> +static inline uint64_t
> +__atomic_wide_counter_fetch_add_acquire (__atomic_wide_counter *c,
> +                                         unsigned int val)
> +{
> +  uint64_t r = __atomic_wide_counter_fetch_add_relaxed (c, val);
> +  atomic_thread_fence_acquire ();
> +  return r;
> +}
> +
> +static inline void
> +__atomic_wide_counter_add_relaxed (__atomic_wide_counter *c,
> +                                   unsigned int val)
> +{
> +  __atomic_wide_counter_fetch_add_relaxed (c, val);
> +}
> +
> +#endif /* !__HAVE_64B_ATOMICS */
> +
> +#endif /* _ATOMIC_WIDE_COUNTER_H */

Ok.

> diff --git a/include/bits/atomic_wide_counter.h b/include/bits/atomic_wide_counter.h
> new file mode 100644
> index 0000000000..8fb09a5291
> --- /dev/null
> +++ b/include/bits/atomic_wide_counter.h
> @@ -0,0 +1 @@
> +#include_next <bits/atomic_wide_counter.h>

Ok.

> diff --git a/misc/Makefile b/misc/Makefile
> index 1083ba3bfc..3b66cb9f6a 100644
> --- a/misc/Makefile
> +++ b/misc/Makefile
> @@ -73,7 +73,8 @@ routines := brk sbrk sstk ioctl \
>  	    fgetxattr flistxattr fremovexattr fsetxattr getxattr \
>  	    listxattr lgetxattr llistxattr lremovexattr lsetxattr \
>  	    removexattr setxattr getauxval ifunc-impl-list makedev \
> -	    allocate_once fd_to_filename single_threaded unwind-link
> +	    allocate_once fd_to_filename single_threaded unwind-link \
> +	    atomic_wide_counter
>  
>  generated += tst-error1.mtrace tst-error1-mem.out \
>    tst-allocate_once.mtrace tst-allocate_once-mem.out

Ok.

> diff --git a/misc/atomic_wide_counter.c b/misc/atomic_wide_counter.c
> new file mode 100644
> index 0000000000..56d8981925
> --- /dev/null
> +++ b/misc/atomic_wide_counter.c
> @@ -0,0 +1,127 @@
> +/* Monotonically increasing wide counters (at least 62 bits).
> +   Copyright (C) 2016-2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <atomic_wide_counter.h>
> +
> +#if !__HAVE_64B_ATOMICS
> +
> +/* Values we add or xor are less than or equal to 1<<31, so we only
> +   have to make overflow-and-addition atomic wrt. to concurrent load
> +   operations and xor operations.  To do that, we split each counter
> +   into two 32b values of which we reserve the MSB of each to
> +   represent an overflow from the lower-order half to the higher-order
> +   half.
> +
> +   In the common case, the state is (higher-order / lower-order half, and . is
> +   basically concatenation of the bits):
> +   0.h     / 0.l  = h.l
> +
> +   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
> +   following steps S1-S4 (the values these represent are on the right-hand
> +   side):
> +   S1:  0.h     / 1.L == (h+1).L
> +   S2:  1.(h+1) / 1.L == (h+1).L
> +   S3:  1.(h+1) / 0.L == (h+1).L
> +   S4:  0.(h+1) / 0.L == (h+1).L
> +   If the LSB of the higher-order half is set, readers will ignore the
> +   overflow bit in the lower-order half.
> +
> +   To get an atomic snapshot in load operations, we exploit that the
> +   higher-order half is monotonically increasing; if we load a value V from
> +   it, then read the lower-order half, and then read the higher-order half
> +   again and see the same value V, we know that both halves have existed in
> +   the sequence of values the full counter had.  This is similar to the
> +   validated reads in the time-based STMs in GCC's libitm (e.g.,
> +   method_ml_wt).
> +
> +   One benefit of this scheme is that this makes load operations
> +   obstruction-free because unlike if we would just lock the counter, readers
> +   can almost always interpret a snapshot of each halves.  Readers can be
> +   forced to read a new snapshot when the read is concurrent with an overflow.
> +   However, overflows will happen infrequently, so load operations are
> +   practically lock-free.  */
> +
> +uint64_t
> +__atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
> +                                         unsigned int op)
> +{
> +  /* S1. Note that this is an atomic read-modify-write so it extends the
> +     release sequence of release MO store at S3.  */
> +  unsigned int l = atomic_fetch_add_relaxed (&c->__value32.__low, op);
> +  unsigned int h = atomic_load_relaxed (&c->__value32.__high);
> +  uint64_t result = ((uint64_t) h << 31) | l;
> +  l += op;
> +  if ((l >> 31) > 0)
> +    {
> +      /* Overflow.  Need to increment higher-order half.  Note that all
> +         add operations are ordered in happens-before.  */
> +      h++;
> +      /* S2. Release MO to synchronize with the loads of the higher-order half
> +         in the load operation.  See __condvar_load_64_relaxed.  */
> +      atomic_store_release (&c->__value32.__high,
> +                            h | ((unsigned int) 1 << 31));
> +      l ^= (unsigned int) 1 << 31;
> +      /* S3.  See __condvar_load_64_relaxed.  */
> +      atomic_store_release (&c->__value32.__low, l);
> +      /* S4.  Likewise.  */
> +      atomic_store_release (&c->__value32.__high, h);
> +    }
> +  return result;
> +}
> +
> +uint64_t
> +__atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
> +{
> +  unsigned int h, l, h2;
> +  do
> +    {
> +      /* This load and the second one below to the same location read from the
> +         stores in the overflow handling of the add operation or the
> +         initializing stores (which is a simple special case because
> +         initialization always completely happens before further use).
> +         Because no two stores to the higher-order half write the same value,
> +         the loop ensures that if we continue to use the snapshot, this load
> +         and the second one read from the same store operation.  All candidate
> +         store operations have release MO.
> +         If we read from S2 in the first load, then we will see the value of
> +         S1 on the next load (because we synchronize with S2), or a value
> +         later in modification order.  We correctly ignore the lower-half's
> +         overflow bit in this case.  If we read from S4, then we will see the
> +         value of S3 in the next load (or a later value), which does not have
> +         the overflow bit set anymore.
> +          */
> +      h = atomic_load_acquire (&c->__value32.__high);
> +      /* This will read from the release sequence of S3 (i.e, either the S3
> +         store or the read-modify-writes at S1 following S3 in modification
> +         order).  Thus, the read synchronizes with S3, and the following load
> +         of the higher-order half will read from the matching S2 (or a later
> +         value).
> +         Thus, if we read a lower-half value here that already overflowed and
> +         belongs to an increased higher-order half value, we will see the
> +         latter and h and h2 will not be equal.  */
> +      l = atomic_load_acquire (&c->__value32.__low);
> +      /* See above.  */
> +      h2 = atomic_load_relaxed (&c->__value32.__high);
> +    }
> +  while (h != h2);
> +  if (((l >> 31) > 0) && ((h >> 31) > 0))
> +    l ^= (unsigned int) 1 << 31;
> +  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
> +}
> +
> +#endif /* !__HAVE_64B_ATOMICS */

Ok, it is basically moving out the code from pthread_cond one.

> diff --git a/nptl/Makefile b/nptl/Makefile
> index ff4d590f11..6310aecaad 100644
> --- a/nptl/Makefile
> +++ b/nptl/Makefile
> @@ -22,8 +22,14 @@ subdir	:= nptl
>  
>  include ../Makeconfig
>  
> -headers := pthread.h semaphore.h bits/semaphore.h \
> -	   bits/struct_mutex.h bits/struct_rwlock.h
> +headers := \
> +  bits/atomic_wide_counter.h \
> +  bits/semaphore.h \
> +  bits/struct_mutex.h \
> +  bits/struct_rwlock.h \
> +  pthread.h \
> +  semaphore.h \
> +  # headers
>  
>  extra-libs := libpthread
>  extra-libs-others := $(extra-libs)

Ok.

> @@ -270,7 +276,7 @@ tests = tst-attr2 tst-attr3 tst-default-attr \
>  	tst-mutexpi1 tst-mutexpi2 tst-mutexpi3 tst-mutexpi4 \
>  	tst-mutexpi5 tst-mutexpi5a tst-mutexpi6 tst-mutexpi7 tst-mutexpi7a \
>  	tst-mutexpi9 tst-mutexpi10 \
> -	tst-cond22 tst-cond26 \
> +	tst-cond26 \
>  	tst-robustpi1 tst-robustpi2 tst-robustpi3 tst-robustpi4 tst-robustpi5 \
>  	tst-robustpi6 tst-robustpi7 tst-robustpi9 \
>  	tst-rwlock2 tst-rwlock2a tst-rwlock2b tst-rwlock3 \
> @@ -319,6 +325,7 @@ tests-internal := tst-robustpi8 tst-rwlock19 tst-rwlock20 \
>  		  tst-barrier5 tst-signal7 tst-mutex8 tst-mutex8-static \
>  		  tst-mutexpi8 tst-mutexpi8-static \
>  		  tst-setgetname \
> +		  tst-cond22 \
>  
>  xtests = tst-setuid1 tst-setuid1-static tst-setuid2 \
>  	tst-mutexpp1 tst-mutexpp6 tst-mutexpp10 tst-setgroups \

Ok.

> diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
> index c35b9ef03a..fb56f93b6e 100644
> --- a/nptl/pthread_cond_common.c
> +++ b/nptl/pthread_cond_common.c
> @@ -17,79 +17,52 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <atomic.h>
> +#include <atomic_wide_counter.h>
>  #include <stdint.h>
>  #include <pthread.h>
>  
> -/* We need 3 least-significant bits on __wrefs for something else.  */
> +/* We need 3 least-significant bits on __wrefs for something else.
> +   This also matches __atomic_wide_counter requirements: The highest
> +   value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to __g1_start
> +   (the two extra bits are for the lock in the two LSBs of
> +   __g1_start).  */
>  #define __PTHREAD_COND_MAX_GROUP_SIZE ((unsigned) 1 << 29)
>  
> -#if __HAVE_64B_ATOMICS == 1
> -
> -static uint64_t __attribute__ ((unused))
> +static inline uint64_t
>  __condvar_load_wseq_relaxed (pthread_cond_t *cond)
>  {
> -  return atomic_load_relaxed (&cond->__data.__wseq);
> +  return __atomic_wide_counter_load_relaxed (&cond->__data.__wseq);
>  }
>  
> -static uint64_t __attribute__ ((unused))
> +static inline uint64_t
>  __condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
>  {
> -  return atomic_fetch_add_acquire (&cond->__data.__wseq, val);
> +  return __atomic_wide_counter_fetch_add_acquire (&cond->__data.__wseq, val);
>  }
>  
> -static uint64_t __attribute__ ((unused))
> -__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
> +static inline uint64_t
> +__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
>  {
> -  return atomic_fetch_xor_release (&cond->__data.__wseq, val);
> +  return __atomic_wide_counter_load_relaxed (&cond->__data.__g1_start);
>  }
>  
> -static uint64_t __attribute__ ((unused))
> -__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
> +static inline void
> +__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
>  {
> -  return atomic_load_relaxed (&cond->__data.__g1_start);
> +  __atomic_wide_counter_add_relaxed (&cond->__data.__g1_start, val);
>  }
>  
> -static void __attribute__ ((unused))
> -__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
> +#if __HAVE_64B_ATOMICS == 1
> +
> +static inline uint64_t
> +__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
>  {
> -  atomic_store_relaxed (&cond->__data.__g1_start,
> -      atomic_load_relaxed (&cond->__data.__g1_start) + val);
> +  return atomic_fetch_xor_release (&cond->__data.__wseq.__value64, val);
>  }
>  

Ok, although the reorganization of the placemente makes review a bit confusing.

> -#else
> -
> -/* We use two 64b counters: __wseq and __g1_start.  They are monotonically
> -   increasing and single-writer-multiple-readers counters, so we can implement
> -   load, fetch-and-add, and fetch-and-xor operations even when we just have
> -   32b atomics.  Values we add or xor are less than or equal to 1<<31 (*),
> -   so we only have to make overflow-and-addition atomic wrt. to concurrent
> -   load operations and xor operations.  To do that, we split each counter into
> -   two 32b values of which we reserve the MSB of each to represent an
> -   overflow from the lower-order half to the higher-order half.
> -
> -   In the common case, the state is (higher-order / lower-order half, and . is
> -   basically concatenation of the bits):
> -   0.h     / 0.l  = h.l
> -
> -   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
> -   following steps S1-S4 (the values these represent are on the right-hand
> -   side):
> -   S1:  0.h     / 1.L == (h+1).L
> -   S2:  1.(h+1) / 1.L == (h+1).L
> -   S3:  1.(h+1) / 0.L == (h+1).L
> -   S4:  0.(h+1) / 0.L == (h+1).L
> -   If the LSB of the higher-order half is set, readers will ignore the
> -   overflow bit in the lower-order half.
> -
> -   To get an atomic snapshot in load operations, we exploit that the
> -   higher-order half is monotonically increasing; if we load a value V from
> -   it, then read the lower-order half, and then read the higher-order half
> -   again and see the same value V, we know that both halves have existed in
> -   the sequence of values the full counter had.  This is similar to the
> -   validated reads in the time-based STMs in GCC's libitm (e.g.,
> -   method_ml_wt).
> -
> -   The xor operation needs to be an atomic read-modify-write.  The write
> +#else /* !__HAVE_64B_ATOMICS */
> +
> +/* The xor operation needs to be an atomic read-modify-write.  The write
>     itself is not an issue as it affects just the lower-order half but not bits
>     used in the add operation.  To make the full fetch-and-xor atomic, we
>     exploit that concurrently, the value can increase by at most 1<<31 (*): The
> @@ -97,117 +70,18 @@ __condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
>     than __PTHREAD_COND_MAX_GROUP_SIZE waiters can enter concurrently and thus
>     increment __wseq.  Therefore, if the xor operation observes a value of
>     __wseq, then the value it applies the modification to later on can be
> -   derived (see below).
> -
> -   One benefit of this scheme is that this makes load operations
> -   obstruction-free because unlike if we would just lock the counter, readers
> -   can almost always interpret a snapshot of each halves.  Readers can be
> -   forced to read a new snapshot when the read is concurrent with an overflow.
> -   However, overflows will happen infrequently, so load operations are
> -   practically lock-free.
> -
> -   (*) The highest value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to
> -   __g1_start (the two extra bits are for the lock in the two LSBs of
> -   __g1_start).  */
> -
> -typedef struct
> -{
> -  unsigned int low;
> -  unsigned int high;
> -} _condvar_lohi;
> -
> -static uint64_t
> -__condvar_fetch_add_64_relaxed (_condvar_lohi *lh, unsigned int op)
> -{
> -  /* S1. Note that this is an atomic read-modify-write so it extends the
> -     release sequence of release MO store at S3.  */
> -  unsigned int l = atomic_fetch_add_relaxed (&lh->low, op);
> -  unsigned int h = atomic_load_relaxed (&lh->high);
> -  uint64_t result = ((uint64_t) h << 31) | l;
> -  l += op;
> -  if ((l >> 31) > 0)
> -    {
> -      /* Overflow.  Need to increment higher-order half.  Note that all
> -	 add operations are ordered in happens-before.  */
> -      h++;
> -      /* S2. Release MO to synchronize with the loads of the higher-order half
> -	 in the load operation.  See __condvar_load_64_relaxed.  */
> -      atomic_store_release (&lh->high, h | ((unsigned int) 1 << 31));
> -      l ^= (unsigned int) 1 << 31;
> -      /* S3.  See __condvar_load_64_relaxed.  */
> -      atomic_store_release (&lh->low, l);
> -      /* S4.  Likewise.  */
> -      atomic_store_release (&lh->high, h);
> -    }
> -  return result;
> -}
> -
> -static uint64_t
> -__condvar_load_64_relaxed (_condvar_lohi *lh)
> -{
> -  unsigned int h, l, h2;
> -  do
> -    {
> -      /* This load and the second one below to the same location read from the
> -	 stores in the overflow handling of the add operation or the
> -	 initializing stores (which is a simple special case because
> -	 initialization always completely happens before further use).
> -	 Because no two stores to the higher-order half write the same value,
> -	 the loop ensures that if we continue to use the snapshot, this load
> -	 and the second one read from the same store operation.  All candidate
> -	 store operations have release MO.
> -	 If we read from S2 in the first load, then we will see the value of
> -	 S1 on the next load (because we synchronize with S2), or a value
> -	 later in modification order.  We correctly ignore the lower-half's
> -	 overflow bit in this case.  If we read from S4, then we will see the
> -	 value of S3 in the next load (or a later value), which does not have
> -	 the overflow bit set anymore.
> -	  */
> -      h = atomic_load_acquire (&lh->high);
> -      /* This will read from the release sequence of S3 (i.e, either the S3
> -	 store or the read-modify-writes at S1 following S3 in modification
> -	 order).  Thus, the read synchronizes with S3, and the following load
> -	 of the higher-order half will read from the matching S2 (or a later
> -	 value).
> -	 Thus, if we read a lower-half value here that already overflowed and
> -	 belongs to an increased higher-order half value, we will see the
> -	 latter and h and h2 will not be equal.  */
> -      l = atomic_load_acquire (&lh->low);
> -      /* See above.  */
> -      h2 = atomic_load_relaxed (&lh->high);
> -    }
> -  while (h != h2);
> -  if (((l >> 31) > 0) && ((h >> 31) > 0))
> -    l ^= (unsigned int) 1 << 31;
> -  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
> -}
> -
> -static uint64_t __attribute__ ((unused))
> -__condvar_load_wseq_relaxed (pthread_cond_t *cond)
> -{
> -  return __condvar_load_64_relaxed ((_condvar_lohi *) &cond->__data.__wseq32);
> -}
> -
> -static uint64_t __attribute__ ((unused))
> -__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
> -{
> -  uint64_t r = __condvar_fetch_add_64_relaxed
> -      ((_condvar_lohi *) &cond->__data.__wseq32, val);
> -  atomic_thread_fence_acquire ();
> -  return r;
> -}
> +   derived.  */
>  

Ok.

>  static uint64_t __attribute__ ((unused))
>  __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
>  {
> -  _condvar_lohi *lh = (_condvar_lohi *) &cond->__data.__wseq32;
>    /* First, get the current value.  See __condvar_load_64_relaxed.  */
>    unsigned int h, l, h2;
>    do
>      {
> -      h = atomic_load_acquire (&lh->high);
> -      l = atomic_load_acquire (&lh->low);
> -      h2 = atomic_load_relaxed (&lh->high);
> +      h = atomic_load_acquire (&cond->__data.__wseq.__value32.__high);
> +      l = atomic_load_acquire (&cond->__data.__wseq.__value32.__low);
> +      h2 = atomic_load_relaxed (&cond->__data.__wseq.__value32.__high);
>      }
>    while (h != h2);
>    if (((l >> 31) > 0) && ((h >> 31) == 0))

Ok.

> @@ -219,8 +93,9 @@ __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
>       earlier in modification order than the following fetch-xor.
>       This uses release MO to make the full operation have release semantics
>       (all other operations access the lower-order half).  */
> -  unsigned int l2 = atomic_fetch_xor_release (&lh->low, val)
> -      & ~((unsigned int) 1 << 31);
> +  unsigned int l2
> +    = (atomic_fetch_xor_release (&cond->__data.__wseq.__value32.__low, val)
> +       & ~((unsigned int) 1 << 31));
>    if (l2 < l)
>      /* The lower-order half overflowed in the meantime.  This happened exactly
>         once due to the limit on concurrent waiters (see above).  */
> @@ -228,22 +103,7 @@ __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
>    return ((uint64_t) h << 31) + l2;
>  }

Ok.

>  
> -static uint64_t __attribute__ ((unused))
> -__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
> -{
> -  return __condvar_load_64_relaxed
> -      ((_condvar_lohi *) &cond->__data.__g1_start32);
> -}
> -
> -static void __attribute__ ((unused))
> -__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
> -{
> -  ignore_value (__condvar_fetch_add_64_relaxed
> -      ((_condvar_lohi *) &cond->__data.__g1_start32, val));
> -}
> -
> -#endif  /* !__HAVE_64B_ATOMICS  */
> -
> +#endif /* !__HAVE_64B_ATOMICS */
>  
>  /* The lock that signalers use.  See pthread_cond_wait_common for uses.
>     The lock is our normal three-state lock: not acquired (0) / acquired (1) /

Ok.

> diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
> index 64f19ea0a5..1336e9c79d 100644
> --- a/nptl/tst-cond22.c
> +++ b/nptl/tst-cond22.c
> @@ -106,8 +106,11 @@ do_test (void)
>        status = 1;
>      }
>  
> -  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
> -	  c.__data.__wseq, c.__data.__g1_start,
> +  printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
> +	  c.__data.__wseq.__value32.__high,
> +	  c.__data.__wseq.__value32.__low,
> +	  c.__data.__g1_start.__value32.__high,
> +	  c.__data.__g1_start.__value32.__low,
>  	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
>  	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
>  	  c.__data.__g1_orig_size, c.__data.__wrefs);
> @@ -149,8 +152,11 @@ do_test (void)
>        status = 1;
>      }
>  
> -  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
> -	  c.__data.__wseq, c.__data.__g1_start,
> +  printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
> +	  c.__data.__wseq.__value32.__high,
> +	  c.__data.__wseq.__value32.__low,
> +	  c.__data.__g1_start.__value32.__high,
> +	  c.__data.__g1_start.__value32.__low,
>  	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
>  	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
>  	  c.__data.__g1_orig_size, c.__data.__wrefs);

Ok.

> diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
> index 44bf1e358d..b82a79a43e 100644
> --- a/sysdeps/nptl/bits/thread-shared-types.h
> +++ b/sysdeps/nptl/bits/thread-shared-types.h
> @@ -43,6 +43,8 @@
>  
>  #include <bits/pthreadtypes-arch.h>
>  
> +#include <bits/atomic_wide_counter.h>
> +
>  
>  /* Common definition of pthread_mutex_t. */
>  
> @@ -91,24 +93,8 @@ typedef struct __pthread_internal_slist
>  
>  struct __pthread_cond_s
>  {
> -  __extension__ union
> -  {
> -    __extension__ unsigned long long int __wseq;
> -    struct
> -    {
> -      unsigned int __low;
> -      unsigned int __high;
> -    } __wseq32;
> -  };
> -  __extension__ union
> -  {
> -    __extension__ unsigned long long int __g1_start;
> -    struct
> -    {
> -      unsigned int __low;
> -      unsigned int __high;
> -    } __g1_start32;
> -  };
> +  __atomic_wide_counter __wseq;
> +  __atomic_wide_counter __g1_start;
>    unsigned int __g_refs[2] __LOCK_ALIGNMENT;
>    unsigned int __g_size[2];
>    unsigned int __g1_orig_size;
> 

Ok.
Jakub Jelinek Nov. 18, 2021, 1:19 p.m. UTC | #2
On Wed, Nov 03, 2021 at 05:27:42PM +0100, Florian Weimer wrote:
> +      /* S3.  See __condvar_load_64_relaxed.  */

Shouldn't that be See __atomic_wide_counter_load_relaxed ?

> +  if (((l >> 31) > 0) && ((h >> 31) > 0))

Any reason not to write it as (int) l < 0 && (int) h < 0?
Yes, the compiler will likely optimize it that way (probably
even as (int) (l & h) < 0, but still...

	Jakub
Florian Weimer Nov. 18, 2021, 1:23 p.m. UTC | #3
* Jakub Jelinek:

> On Wed, Nov 03, 2021 at 05:27:42PM +0100, Florian Weimer wrote:
>> +      /* S3.  See __condvar_load_64_relaxed.  */
>
> Shouldn't that be See __atomic_wide_counter_load_relaxed ?

I'm going to fix the stale reference, thanks.

>> +  if (((l >> 31) > 0) && ((h >> 31) > 0))
>
> Any reason not to write it as (int) l < 0 && (int) h < 0?

This requires a GCC extension (which active by default, even without
-fwrapv).  Obviously we relay on this extension in many other place, but
still …

Thanks,
Florian
diff mbox series

Patch

diff --git a/bits/atomic_wide_counter.h b/bits/atomic_wide_counter.h
new file mode 100644
index 0000000000..0687eb554e
--- /dev/null
+++ b/bits/atomic_wide_counter.h
@@ -0,0 +1,35 @@ 
+/* Monotonically increasing wide counters (at least 62 bits).
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_ATOMIC_WIDE_COUNTER_H
+#define _BITS_ATOMIC_WIDE_COUNTER_H
+
+/* Counter that is monotonically increasing (by less than 2**31 per
+   increment), with a single writer, and an arbitrary number of
+   readers.  */
+typedef union
+{
+  __extension__ unsigned long long int __value64;
+  struct
+  {
+    unsigned int __low;
+    unsigned int __high;
+  } __value32;
+} __atomic_wide_counter;
+
+#endif /* _BITS_ATOMIC_WIDE_COUNTER_H */
diff --git a/include/atomic_wide_counter.h b/include/atomic_wide_counter.h
new file mode 100644
index 0000000000..31f009d5e6
--- /dev/null
+++ b/include/atomic_wide_counter.h
@@ -0,0 +1,89 @@ 
+/* Monotonically increasing wide counters (at least 62 bits).
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _ATOMIC_WIDE_COUNTER_H
+#define _ATOMIC_WIDE_COUNTER_H
+
+#include <atomic.h>
+#include <bits/atomic_wide_counter.h>
+
+#if __HAVE_64B_ATOMICS
+
+static inline uint64_t
+__atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
+{
+  return atomic_load_relaxed (&c->__value64);
+}
+
+static inline uint64_t
+__atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
+                                         unsigned int val)
+{
+  return atomic_fetch_add_relaxed (&c->__value64, val);
+}
+
+static inline uint64_t
+__atomic_wide_counter_fetch_add_acquire (__atomic_wide_counter *c,
+                                         unsigned int val)
+{
+  return atomic_fetch_add_acquire (&c->__value64, val);
+}
+
+static inline void
+__atomic_wide_counter_add_relaxed (__atomic_wide_counter *c,
+                                   unsigned int val)
+{
+  atomic_store_relaxed (&c->__value64,
+                        atomic_load_relaxed (&c->__value64) + val);
+}
+
+static uint64_t __attribute__ ((unused))
+__atomic_wide_counter_fetch_xor_release (__atomic_wide_counter *c,
+                                         unsigned int val)
+{
+  return atomic_fetch_xor_release (&c->__value64, val);
+}
+
+#else /* !__HAVE_64B_ATOMICS */
+
+uint64_t __atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
+  attribute_hidden;
+
+uint64_t __atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
+                                                  unsigned int op)
+  attribute_hidden;
+
+static inline uint64_t
+__atomic_wide_counter_fetch_add_acquire (__atomic_wide_counter *c,
+                                         unsigned int val)
+{
+  uint64_t r = __atomic_wide_counter_fetch_add_relaxed (c, val);
+  atomic_thread_fence_acquire ();
+  return r;
+}
+
+static inline void
+__atomic_wide_counter_add_relaxed (__atomic_wide_counter *c,
+                                   unsigned int val)
+{
+  __atomic_wide_counter_fetch_add_relaxed (c, val);
+}
+
+#endif /* !__HAVE_64B_ATOMICS */
+
+#endif /* _ATOMIC_WIDE_COUNTER_H */
diff --git a/include/bits/atomic_wide_counter.h b/include/bits/atomic_wide_counter.h
new file mode 100644
index 0000000000..8fb09a5291
--- /dev/null
+++ b/include/bits/atomic_wide_counter.h
@@ -0,0 +1 @@ 
+#include_next <bits/atomic_wide_counter.h>
diff --git a/misc/Makefile b/misc/Makefile
index 1083ba3bfc..3b66cb9f6a 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -73,7 +73,8 @@  routines := brk sbrk sstk ioctl \
 	    fgetxattr flistxattr fremovexattr fsetxattr getxattr \
 	    listxattr lgetxattr llistxattr lremovexattr lsetxattr \
 	    removexattr setxattr getauxval ifunc-impl-list makedev \
-	    allocate_once fd_to_filename single_threaded unwind-link
+	    allocate_once fd_to_filename single_threaded unwind-link \
+	    atomic_wide_counter
 
 generated += tst-error1.mtrace tst-error1-mem.out \
   tst-allocate_once.mtrace tst-allocate_once-mem.out
diff --git a/misc/atomic_wide_counter.c b/misc/atomic_wide_counter.c
new file mode 100644
index 0000000000..56d8981925
--- /dev/null
+++ b/misc/atomic_wide_counter.c
@@ -0,0 +1,127 @@ 
+/* Monotonically increasing wide counters (at least 62 bits).
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <atomic_wide_counter.h>
+
+#if !__HAVE_64B_ATOMICS
+
+/* Values we add or xor are less than or equal to 1<<31, so we only
+   have to make overflow-and-addition atomic wrt. to concurrent load
+   operations and xor operations.  To do that, we split each counter
+   into two 32b values of which we reserve the MSB of each to
+   represent an overflow from the lower-order half to the higher-order
+   half.
+
+   In the common case, the state is (higher-order / lower-order half, and . is
+   basically concatenation of the bits):
+   0.h     / 0.l  = h.l
+
+   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
+   following steps S1-S4 (the values these represent are on the right-hand
+   side):
+   S1:  0.h     / 1.L == (h+1).L
+   S2:  1.(h+1) / 1.L == (h+1).L
+   S3:  1.(h+1) / 0.L == (h+1).L
+   S4:  0.(h+1) / 0.L == (h+1).L
+   If the LSB of the higher-order half is set, readers will ignore the
+   overflow bit in the lower-order half.
+
+   To get an atomic snapshot in load operations, we exploit that the
+   higher-order half is monotonically increasing; if we load a value V from
+   it, then read the lower-order half, and then read the higher-order half
+   again and see the same value V, we know that both halves have existed in
+   the sequence of values the full counter had.  This is similar to the
+   validated reads in the time-based STMs in GCC's libitm (e.g.,
+   method_ml_wt).
+
+   One benefit of this scheme is that this makes load operations
+   obstruction-free because unlike if we would just lock the counter, readers
+   can almost always interpret a snapshot of each halves.  Readers can be
+   forced to read a new snapshot when the read is concurrent with an overflow.
+   However, overflows will happen infrequently, so load operations are
+   practically lock-free.  */
+
+uint64_t
+__atomic_wide_counter_fetch_add_relaxed (__atomic_wide_counter *c,
+                                         unsigned int op)
+{
+  /* S1. Note that this is an atomic read-modify-write so it extends the
+     release sequence of release MO store at S3.  */
+  unsigned int l = atomic_fetch_add_relaxed (&c->__value32.__low, op);
+  unsigned int h = atomic_load_relaxed (&c->__value32.__high);
+  uint64_t result = ((uint64_t) h << 31) | l;
+  l += op;
+  if ((l >> 31) > 0)
+    {
+      /* Overflow.  Need to increment higher-order half.  Note that all
+         add operations are ordered in happens-before.  */
+      h++;
+      /* S2. Release MO to synchronize with the loads of the higher-order half
+         in the load operation.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&c->__value32.__high,
+                            h | ((unsigned int) 1 << 31));
+      l ^= (unsigned int) 1 << 31;
+      /* S3.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&c->__value32.__low, l);
+      /* S4.  Likewise.  */
+      atomic_store_release (&c->__value32.__high, h);
+    }
+  return result;
+}
+
+uint64_t
+__atomic_wide_counter_load_relaxed (__atomic_wide_counter *c)
+{
+  unsigned int h, l, h2;
+  do
+    {
+      /* This load and the second one below to the same location read from the
+         stores in the overflow handling of the add operation or the
+         initializing stores (which is a simple special case because
+         initialization always completely happens before further use).
+         Because no two stores to the higher-order half write the same value,
+         the loop ensures that if we continue to use the snapshot, this load
+         and the second one read from the same store operation.  All candidate
+         store operations have release MO.
+         If we read from S2 in the first load, then we will see the value of
+         S1 on the next load (because we synchronize with S2), or a value
+         later in modification order.  We correctly ignore the lower-half's
+         overflow bit in this case.  If we read from S4, then we will see the
+         value of S3 in the next load (or a later value), which does not have
+         the overflow bit set anymore.
+          */
+      h = atomic_load_acquire (&c->__value32.__high);
+      /* This will read from the release sequence of S3 (i.e, either the S3
+         store or the read-modify-writes at S1 following S3 in modification
+         order).  Thus, the read synchronizes with S3, and the following load
+         of the higher-order half will read from the matching S2 (or a later
+         value).
+         Thus, if we read a lower-half value here that already overflowed and
+         belongs to an increased higher-order half value, we will see the
+         latter and h and h2 will not be equal.  */
+      l = atomic_load_acquire (&c->__value32.__low);
+      /* See above.  */
+      h2 = atomic_load_relaxed (&c->__value32.__high);
+    }
+  while (h != h2);
+  if (((l >> 31) > 0) && ((h >> 31) > 0))
+    l ^= (unsigned int) 1 << 31;
+  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
+}
+
+#endif /* !__HAVE_64B_ATOMICS */
diff --git a/nptl/Makefile b/nptl/Makefile
index ff4d590f11..6310aecaad 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -22,8 +22,14 @@  subdir	:= nptl
 
 include ../Makeconfig
 
-headers := pthread.h semaphore.h bits/semaphore.h \
-	   bits/struct_mutex.h bits/struct_rwlock.h
+headers := \
+  bits/atomic_wide_counter.h \
+  bits/semaphore.h \
+  bits/struct_mutex.h \
+  bits/struct_rwlock.h \
+  pthread.h \
+  semaphore.h \
+  # headers
 
 extra-libs := libpthread
 extra-libs-others := $(extra-libs)
@@ -270,7 +276,7 @@  tests = tst-attr2 tst-attr3 tst-default-attr \
 	tst-mutexpi1 tst-mutexpi2 tst-mutexpi3 tst-mutexpi4 \
 	tst-mutexpi5 tst-mutexpi5a tst-mutexpi6 tst-mutexpi7 tst-mutexpi7a \
 	tst-mutexpi9 tst-mutexpi10 \
-	tst-cond22 tst-cond26 \
+	tst-cond26 \
 	tst-robustpi1 tst-robustpi2 tst-robustpi3 tst-robustpi4 tst-robustpi5 \
 	tst-robustpi6 tst-robustpi7 tst-robustpi9 \
 	tst-rwlock2 tst-rwlock2a tst-rwlock2b tst-rwlock3 \
@@ -319,6 +325,7 @@  tests-internal := tst-robustpi8 tst-rwlock19 tst-rwlock20 \
 		  tst-barrier5 tst-signal7 tst-mutex8 tst-mutex8-static \
 		  tst-mutexpi8 tst-mutexpi8-static \
 		  tst-setgetname \
+		  tst-cond22 \
 
 xtests = tst-setuid1 tst-setuid1-static tst-setuid2 \
 	tst-mutexpp1 tst-mutexpp6 tst-mutexpp10 tst-setgroups \
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index c35b9ef03a..fb56f93b6e 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -17,79 +17,52 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <atomic.h>
+#include <atomic_wide_counter.h>
 #include <stdint.h>
 #include <pthread.h>
 
-/* We need 3 least-significant bits on __wrefs for something else.  */
+/* We need 3 least-significant bits on __wrefs for something else.
+   This also matches __atomic_wide_counter requirements: The highest
+   value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to __g1_start
+   (the two extra bits are for the lock in the two LSBs of
+   __g1_start).  */
 #define __PTHREAD_COND_MAX_GROUP_SIZE ((unsigned) 1 << 29)
 
-#if __HAVE_64B_ATOMICS == 1
-
-static uint64_t __attribute__ ((unused))
+static inline uint64_t
 __condvar_load_wseq_relaxed (pthread_cond_t *cond)
 {
-  return atomic_load_relaxed (&cond->__data.__wseq);
+  return __atomic_wide_counter_load_relaxed (&cond->__data.__wseq);
 }
 
-static uint64_t __attribute__ ((unused))
+static inline uint64_t
 __condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
 {
-  return atomic_fetch_add_acquire (&cond->__data.__wseq, val);
+  return __atomic_wide_counter_fetch_add_acquire (&cond->__data.__wseq, val);
 }
 
-static uint64_t __attribute__ ((unused))
-__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
+static inline uint64_t
+__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
 {
-  return atomic_fetch_xor_release (&cond->__data.__wseq, val);
+  return __atomic_wide_counter_load_relaxed (&cond->__data.__g1_start);
 }
 
-static uint64_t __attribute__ ((unused))
-__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
+static inline void
+__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
 {
-  return atomic_load_relaxed (&cond->__data.__g1_start);
+  __atomic_wide_counter_add_relaxed (&cond->__data.__g1_start, val);
 }
 
-static void __attribute__ ((unused))
-__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
+#if __HAVE_64B_ATOMICS == 1
+
+static inline uint64_t
+__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
 {
-  atomic_store_relaxed (&cond->__data.__g1_start,
-      atomic_load_relaxed (&cond->__data.__g1_start) + val);
+  return atomic_fetch_xor_release (&cond->__data.__wseq.__value64, val);
 }
 
-#else
-
-/* We use two 64b counters: __wseq and __g1_start.  They are monotonically
-   increasing and single-writer-multiple-readers counters, so we can implement
-   load, fetch-and-add, and fetch-and-xor operations even when we just have
-   32b atomics.  Values we add or xor are less than or equal to 1<<31 (*),
-   so we only have to make overflow-and-addition atomic wrt. to concurrent
-   load operations and xor operations.  To do that, we split each counter into
-   two 32b values of which we reserve the MSB of each to represent an
-   overflow from the lower-order half to the higher-order half.
-
-   In the common case, the state is (higher-order / lower-order half, and . is
-   basically concatenation of the bits):
-   0.h     / 0.l  = h.l
-
-   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
-   following steps S1-S4 (the values these represent are on the right-hand
-   side):
-   S1:  0.h     / 1.L == (h+1).L
-   S2:  1.(h+1) / 1.L == (h+1).L
-   S3:  1.(h+1) / 0.L == (h+1).L
-   S4:  0.(h+1) / 0.L == (h+1).L
-   If the LSB of the higher-order half is set, readers will ignore the
-   overflow bit in the lower-order half.
-
-   To get an atomic snapshot in load operations, we exploit that the
-   higher-order half is monotonically increasing; if we load a value V from
-   it, then read the lower-order half, and then read the higher-order half
-   again and see the same value V, we know that both halves have existed in
-   the sequence of values the full counter had.  This is similar to the
-   validated reads in the time-based STMs in GCC's libitm (e.g.,
-   method_ml_wt).
-
-   The xor operation needs to be an atomic read-modify-write.  The write
+#else /* !__HAVE_64B_ATOMICS */
+
+/* The xor operation needs to be an atomic read-modify-write.  The write
    itself is not an issue as it affects just the lower-order half but not bits
    used in the add operation.  To make the full fetch-and-xor atomic, we
    exploit that concurrently, the value can increase by at most 1<<31 (*): The
@@ -97,117 +70,18 @@  __condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
    than __PTHREAD_COND_MAX_GROUP_SIZE waiters can enter concurrently and thus
    increment __wseq.  Therefore, if the xor operation observes a value of
    __wseq, then the value it applies the modification to later on can be
-   derived (see below).
-
-   One benefit of this scheme is that this makes load operations
-   obstruction-free because unlike if we would just lock the counter, readers
-   can almost always interpret a snapshot of each halves.  Readers can be
-   forced to read a new snapshot when the read is concurrent with an overflow.
-   However, overflows will happen infrequently, so load operations are
-   practically lock-free.
-
-   (*) The highest value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to
-   __g1_start (the two extra bits are for the lock in the two LSBs of
-   __g1_start).  */
-
-typedef struct
-{
-  unsigned int low;
-  unsigned int high;
-} _condvar_lohi;
-
-static uint64_t
-__condvar_fetch_add_64_relaxed (_condvar_lohi *lh, unsigned int op)
-{
-  /* S1. Note that this is an atomic read-modify-write so it extends the
-     release sequence of release MO store at S3.  */
-  unsigned int l = atomic_fetch_add_relaxed (&lh->low, op);
-  unsigned int h = atomic_load_relaxed (&lh->high);
-  uint64_t result = ((uint64_t) h << 31) | l;
-  l += op;
-  if ((l >> 31) > 0)
-    {
-      /* Overflow.  Need to increment higher-order half.  Note that all
-	 add operations are ordered in happens-before.  */
-      h++;
-      /* S2. Release MO to synchronize with the loads of the higher-order half
-	 in the load operation.  See __condvar_load_64_relaxed.  */
-      atomic_store_release (&lh->high, h | ((unsigned int) 1 << 31));
-      l ^= (unsigned int) 1 << 31;
-      /* S3.  See __condvar_load_64_relaxed.  */
-      atomic_store_release (&lh->low, l);
-      /* S4.  Likewise.  */
-      atomic_store_release (&lh->high, h);
-    }
-  return result;
-}
-
-static uint64_t
-__condvar_load_64_relaxed (_condvar_lohi *lh)
-{
-  unsigned int h, l, h2;
-  do
-    {
-      /* This load and the second one below to the same location read from the
-	 stores in the overflow handling of the add operation or the
-	 initializing stores (which is a simple special case because
-	 initialization always completely happens before further use).
-	 Because no two stores to the higher-order half write the same value,
-	 the loop ensures that if we continue to use the snapshot, this load
-	 and the second one read from the same store operation.  All candidate
-	 store operations have release MO.
-	 If we read from S2 in the first load, then we will see the value of
-	 S1 on the next load (because we synchronize with S2), or a value
-	 later in modification order.  We correctly ignore the lower-half's
-	 overflow bit in this case.  If we read from S4, then we will see the
-	 value of S3 in the next load (or a later value), which does not have
-	 the overflow bit set anymore.
-	  */
-      h = atomic_load_acquire (&lh->high);
-      /* This will read from the release sequence of S3 (i.e, either the S3
-	 store or the read-modify-writes at S1 following S3 in modification
-	 order).  Thus, the read synchronizes with S3, and the following load
-	 of the higher-order half will read from the matching S2 (or a later
-	 value).
-	 Thus, if we read a lower-half value here that already overflowed and
-	 belongs to an increased higher-order half value, we will see the
-	 latter and h and h2 will not be equal.  */
-      l = atomic_load_acquire (&lh->low);
-      /* See above.  */
-      h2 = atomic_load_relaxed (&lh->high);
-    }
-  while (h != h2);
-  if (((l >> 31) > 0) && ((h >> 31) > 0))
-    l ^= (unsigned int) 1 << 31;
-  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
-}
-
-static uint64_t __attribute__ ((unused))
-__condvar_load_wseq_relaxed (pthread_cond_t *cond)
-{
-  return __condvar_load_64_relaxed ((_condvar_lohi *) &cond->__data.__wseq32);
-}
-
-static uint64_t __attribute__ ((unused))
-__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
-{
-  uint64_t r = __condvar_fetch_add_64_relaxed
-      ((_condvar_lohi *) &cond->__data.__wseq32, val);
-  atomic_thread_fence_acquire ();
-  return r;
-}
+   derived.  */
 
 static uint64_t __attribute__ ((unused))
 __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
 {
-  _condvar_lohi *lh = (_condvar_lohi *) &cond->__data.__wseq32;
   /* First, get the current value.  See __condvar_load_64_relaxed.  */
   unsigned int h, l, h2;
   do
     {
-      h = atomic_load_acquire (&lh->high);
-      l = atomic_load_acquire (&lh->low);
-      h2 = atomic_load_relaxed (&lh->high);
+      h = atomic_load_acquire (&cond->__data.__wseq.__value32.__high);
+      l = atomic_load_acquire (&cond->__data.__wseq.__value32.__low);
+      h2 = atomic_load_relaxed (&cond->__data.__wseq.__value32.__high);
     }
   while (h != h2);
   if (((l >> 31) > 0) && ((h >> 31) == 0))
@@ -219,8 +93,9 @@  __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
      earlier in modification order than the following fetch-xor.
      This uses release MO to make the full operation have release semantics
      (all other operations access the lower-order half).  */
-  unsigned int l2 = atomic_fetch_xor_release (&lh->low, val)
-      & ~((unsigned int) 1 << 31);
+  unsigned int l2
+    = (atomic_fetch_xor_release (&cond->__data.__wseq.__value32.__low, val)
+       & ~((unsigned int) 1 << 31));
   if (l2 < l)
     /* The lower-order half overflowed in the meantime.  This happened exactly
        once due to the limit on concurrent waiters (see above).  */
@@ -228,22 +103,7 @@  __condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
   return ((uint64_t) h << 31) + l2;
 }
 
-static uint64_t __attribute__ ((unused))
-__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
-{
-  return __condvar_load_64_relaxed
-      ((_condvar_lohi *) &cond->__data.__g1_start32);
-}
-
-static void __attribute__ ((unused))
-__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
-{
-  ignore_value (__condvar_fetch_add_64_relaxed
-      ((_condvar_lohi *) &cond->__data.__g1_start32, val));
-}
-
-#endif  /* !__HAVE_64B_ATOMICS  */
-
+#endif /* !__HAVE_64B_ATOMICS */
 
 /* The lock that signalers use.  See pthread_cond_wait_common for uses.
    The lock is our normal three-state lock: not acquired (0) / acquired (1) /
diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
index 64f19ea0a5..1336e9c79d 100644
--- a/nptl/tst-cond22.c
+++ b/nptl/tst-cond22.c
@@ -106,8 +106,11 @@  do_test (void)
       status = 1;
     }
 
-  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
-	  c.__data.__wseq, c.__data.__g1_start,
+  printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+	  c.__data.__wseq.__value32.__high,
+	  c.__data.__wseq.__value32.__low,
+	  c.__data.__g1_start.__value32.__high,
+	  c.__data.__g1_start.__value32.__low,
 	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
 	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
 	  c.__data.__g1_orig_size, c.__data.__wrefs);
@@ -149,8 +152,11 @@  do_test (void)
       status = 1;
     }
 
-  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
-	  c.__data.__wseq, c.__data.__g1_start,
+  printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+	  c.__data.__wseq.__value32.__high,
+	  c.__data.__wseq.__value32.__low,
+	  c.__data.__g1_start.__value32.__high,
+	  c.__data.__g1_start.__value32.__low,
 	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
 	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
 	  c.__data.__g1_orig_size, c.__data.__wrefs);
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index 44bf1e358d..b82a79a43e 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -43,6 +43,8 @@ 
 
 #include <bits/pthreadtypes-arch.h>
 
+#include <bits/atomic_wide_counter.h>
+
 
 /* Common definition of pthread_mutex_t. */
 
@@ -91,24 +93,8 @@  typedef struct __pthread_internal_slist
 
 struct __pthread_cond_s
 {
-  __extension__ union
-  {
-    __extension__ unsigned long long int __wseq;
-    struct
-    {
-      unsigned int __low;
-      unsigned int __high;
-    } __wseq32;
-  };
-  __extension__ union
-  {
-    __extension__ unsigned long long int __g1_start;
-    struct
-    {
-      unsigned int __low;
-      unsigned int __high;
-    } __g1_start32;
-  };
+  __atomic_wide_counter __wseq;
+  __atomic_wide_counter __g1_start;
   unsigned int __g_refs[2] __LOCK_ALIGNMENT;
   unsigned int __g_size[2];
   unsigned int __g1_orig_size;