[3/7] Remove atomic_bit_set/bit_test_set

Message ID AM5PR0801MB1668D792891F7663B534D31883809@AM5PR0801MB1668.eurprd08.prod.outlook.com
State Dropped
Series [1/7] Use atomic_exchange_release/acquire

Checks

Context Check Description
dj/TryBot-apply_patch fail Patch failed to apply to master at the time it was sent

Commit Message

Wilco Dijkstra July 6, 2022, 3:14 p.m. UTC
Replace the three uses of atomic_bit_set and atomic_bit_test_set with
atomic_fetch_or_acquire.  This enables the removal of many target-specific
implementations.
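
As a sketch of the mapping (taken from the nptl_free_tcb.c hunk below; the
variable names here are schematic, not part of the patch):

  /* Old idiom: atomically set a bit and test its previous value.  */
  int was_terminated =
    atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT);

  /* New idiom: the acquire fetch-or returns the old value of *mem, so
     the bit's previous state is recovered by masking.  */
  int was_terminated2 =
    (atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
     & (1 << TERMINATED_BIT)) != 0;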

---
  

Comments

Noah Goldstein July 6, 2022, 4:14 p.m. UTC | #1
On Wed, Jul 6, 2022 at 8:14 AM Wilco Dijkstra via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Replace the 3 uses of atomic_bit_set and atomic_bit_test_set with atomic_fetch_or_acquire.
> This enables removal of a lot of target specific implementations.
>
> ---
>
> diff --git a/include/atomic.h b/include/atomic.h
> index 73cc772f0149f94fa0c3e14fa858fa89fee6985f..ed1fc38e7569fcbdd0473ab6f69956a44d62354f 100644
> --- a/include/atomic.h
> +++ b/include/atomic.h
> @@ -255,28 +255,6 @@
>  #endif
>
>
> -#ifndef atomic_bit_set
> -# define atomic_bit_set(mem, bit) \
> -  (void) atomic_bit_test_set(mem, bit)
> -#endif
> -
> -
> -#ifndef atomic_bit_test_set
> -# define atomic_bit_test_set(mem, bit) \
> -  ({ __typeof (*(mem)) __atg14_old;                                          \
> -     __typeof (mem) __atg14_memp = (mem);                                    \
> -     __typeof (*(mem)) __atg14_mask = ((__typeof (*(mem))) 1 << (bit));              \
> -                                                                             \
> -     do                                                                              \
> -       __atg14_old = (*__atg14_memp);                                        \
> -     while (__builtin_expect                                                 \
> -           (atomic_compare_and_exchange_bool_acq (__atg14_memp,              \
> -                                                  __atg14_old | __atg14_mask,\
> -                                                  __atg14_old), 0));         \
> -                                                                             \
> -     __atg14_old & __atg14_mask; })
> -#endif
> -
>  /* Atomically *mem &= mask.  */
>  #ifndef atomic_and
>  # define atomic_and(mem, mask) \
> diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
> index 13acf0da8f1ab41774175401a68ce28a0eb7ed30..765b036873e181d6c49299c0ad08d5dbfcc05387 100644
> --- a/misc/tst-atomic.c
> +++ b/misc/tst-atomic.c
> @@ -325,66 +325,6 @@ do_test (void)
>        ret = 1;
>      }
>
> -  mem = 0;
> -  atomic_bit_set (&mem, 1);
> -  if (mem != 2)
> -    {
> -      puts ("atomic_bit_set test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 8;
> -  atomic_bit_set (&mem, 3);
> -  if (mem != 8)
> -    {
> -      puts ("atomic_bit_set test 2 failed");
> -      ret = 1;
> -    }
> -
> -#ifdef TEST_ATOMIC64
> -  mem = 16;
> -  atomic_bit_set (&mem, 35);
> -  if (mem != 0x800000010LL)
> -    {
> -      puts ("atomic_bit_set test 3 failed");
> -      ret = 1;
> -    }
> -#endif
> -
> -  mem = 0;
> -  if (atomic_bit_test_set (&mem, 1)
> -      || mem != 2)
> -    {
> -      puts ("atomic_bit_test_set test 1 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 8;
> -  if (! atomic_bit_test_set (&mem, 3)
> -      || mem != 8)
> -    {
> -      puts ("atomic_bit_test_set test 2 failed");
> -      ret = 1;
> -    }
> -
> -#ifdef TEST_ATOMIC64
> -  mem = 16;
> -  if (atomic_bit_test_set (&mem, 35)
> -      || mem != 0x800000010LL)
> -    {
> -      puts ("atomic_bit_test_set test 3 failed");
> -      ret = 1;
> -    }
> -
> -  mem = 0x100000000LL;
> -  if (! atomic_bit_test_set (&mem, 32)
> -      || mem != 0x100000000LL)
> -    {
> -      puts ("atomic_bit_test_set test 4 failed");
> -      ret = 1;
> -    }
> -#endif
> -
>    /* Tests for C11-like atomics.  */
>    mem = 11;
>    if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
> diff --git a/nptl/nptl_free_tcb.c b/nptl/nptl_free_tcb.c
> index 0d31143ac849de6398e06c399b94813ae57dcff3..b63ec3bc3c1df4a1fc5272a24d453e679dbedd5e 100644
> --- a/nptl/nptl_free_tcb.c
> +++ b/nptl/nptl_free_tcb.c
> @@ -24,7 +24,8 @@ void
>  __nptl_free_tcb (struct pthread *pd)
>  {
>    /* The thread is exiting now.  */
> -  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> +  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
> +      & (1 << TERMINATED_BIT)) == 0)

I think this change may cause regressions on x86 because at the moment
it seems the only way to generate `lock bts` is with inline assembly:

https://godbolt.org/z/bdchE53Ps

Can we keep atomic_bit_test_set?
>      {
>        /* Free TPP data.  */
>        if (pd->tpp != NULL)
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 870a8fcb34eb43b58c2260fee6a4624f0fbbd469..4b7f3edc384748f300ca935ad878eb0e3547e163 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -487,7 +487,7 @@ start_thread (void *arg)
>    /* The thread is exiting now.  Don't set this bit until after we've hit
>       the event-reporting breakpoint, so that td_thr_get_info on us while at
>       the breakpoint reports TD_THR_RUN state rather than TD_THR_ZOMBIE.  */
> -  atomic_bit_set (&pd->cancelhandling, EXITING_BIT);
> +  atomic_fetch_or_acquire (&pd->cancelhandling, 1 << EXITING_BIT);
>
>    if (__glibc_unlikely (atomic_decrement_and_test (&__nptl_nthreads)))
>      /* This was the last thread.  */
> diff --git a/sysdeps/alpha/atomic-machine.h b/sysdeps/alpha/atomic-machine.h
> index df28b90261aa485125d951bc1abf76602730bfd0..115a9df5d77cd08bcb0a49d9b59f0c53b4a20d78 100644
> --- a/sysdeps/alpha/atomic-machine.h
> +++ b/sysdeps/alpha/atomic-machine.h
> @@ -325,15 +325,6 @@
>  #define atomic_exchange_and_add(mem, value) \
>    __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, __MB, __MB)
>
> -
> -/* ??? Blah, I'm lazy.  Implement these later.  Can do better than the
> -   compare-and-exchange loop provided by generic code.
> -
> -#define atomic_decrement_if_positive(mem)
> -#define atomic_bit_test_set(mem, bit)
> -
> -*/
> -
>  #define atomic_full_barrier()  __asm ("mb" : : : "memory");
>  #define atomic_read_barrier()  __asm ("mb" : : : "memory");
>  #define atomic_write_barrier() __asm ("wmb" : : : "memory");
> diff --git a/sysdeps/ia64/atomic-machine.h b/sysdeps/ia64/atomic-machine.h
> index 71afcfe0d0031a6ceae5879a2b3b19ed2ee2f110..b2f5d2f4774cc2503c7595cb82f30f60fbcbe89c 100644
> --- a/sysdeps/ia64/atomic-machine.h
> +++ b/sysdeps/ia64/atomic-machine.h
> @@ -77,20 +77,4 @@
>       while (__builtin_expect (__val != __oldval, 0));                        \
>       __oldval; })
>
> -#define atomic_bit_test_set(mem, bit) \
> -  ({ __typeof (*mem) __oldval, __val;                                        \
> -     __typeof (mem) __memp = (mem);                                          \
> -     __typeof (*mem) __mask = ((__typeof (*mem)) 1 << (bit));                \
> -                                                                             \
> -     __val = (*__memp);                                                              \
> -     do                                                                              \
> -       {                                                                     \
> -        __oldval = __val;                                                    \
> -        __val = atomic_compare_and_exchange_val_acq (__memp,                 \
> -                                                     __oldval | __mask,      \
> -                                                     __oldval);              \
> -       }                                                                     \
> -     while (__builtin_expect (__val != __oldval, 0));                        \
> -     __oldval & __mask; })
> -
>  #define atomic_full_barrier() __sync_synchronize ()
> diff --git a/sysdeps/m68k/m680x0/m68020/atomic-machine.h b/sysdeps/m68k/m680x0/m68020/atomic-machine.h
> index 70cda007341b49d28dc1d9847a88198b47589db4..72a3b81642c1a23207054092d16bb856556cd897 100644
> --- a/sysdeps/m68k/m680x0/m68020/atomic-machine.h
> +++ b/sysdeps/m68k/m680x0/m68020/atomic-machine.h
> @@ -218,15 +218,3 @@
>                            : "memory");                                       \
>         }                                                                     \
>       __result; })
> -
> -#define atomic_bit_set(mem, bit) \
> -  __asm __volatile ("bfset %0{%1,#1}"                                        \
> -                   : "+m" (*(mem))                                           \
> -                   : "di" (sizeof (*(mem)) * 8 - (bit) - 1))
> -
> -#define atomic_bit_test_set(mem, bit) \
> -  ({ char __result;                                                          \
> -     __asm __volatile ("bfset %1{%2,#1}; sne %0"                             \
> -                      : "=dm" (__result), "+m" (*(mem))                      \
> -                      : "di" (sizeof (*(mem)) * 8 - (bit) - 1));             \
> -     __result; })
> diff --git a/sysdeps/nptl/pthreadP.h b/sysdeps/nptl/pthreadP.h
> index 39af275c254ef3e737736bd5c38099bada8746d6..e8bd4c6dcbf7b7785857ff08c15dac9a5cfb2a5d 100644
> --- a/sysdeps/nptl/pthreadP.h
> +++ b/sysdeps/nptl/pthreadP.h
> @@ -43,12 +43,6 @@
>    atomic_compare_and_exchange_val_acq (&(descr)->member, new, old)
>  #endif
>
> -#ifndef THREAD_ATOMIC_BIT_SET
> -# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
> -  atomic_bit_set (&(descr)->member, bit)
> -#endif
> -
> -
>  static inline short max_adaptive_count (void)
>  {
>  #if HAVE_TUNABLES
> @@ -276,7 +270,7 @@ __do_cancel (void)
>    struct pthread *self = THREAD_SELF;
>
>    /* Make sure we get no more cancellations.  */
> -  atomic_bit_set (&self->cancelhandling, EXITING_BIT);
> +  atomic_fetch_or_acquire (&self->cancelhandling, 1 << EXITING_BIT);
>
>    __pthread_unwind ((__pthread_unwind_buf_t *)
>                     THREAD_GETMEM (self, cleanup_jmp_buf));
> diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
> index fe7f04cf30cf5b62fea743a72384f8fc248d2d25..db31e377970c4ab6285ef65fb21419db7c6ca373 100644
> --- a/sysdeps/s390/atomic-machine.h
> +++ b/sysdeps/s390/atomic-machine.h
> @@ -93,17 +93,6 @@
>      atomic_or_val (mem, mask);                 \
>    } while (0)
>
> -/* Atomically *mem |= 1 << bit and return true if the bit was set in old value
> -   of *mem.  */
> -/* The load-and-or instruction is used on z196 zarch and higher cpus
> -   instead of a loop with compare-and-swap instruction.  */
> -#define atomic_bit_test_set(mem, bit)                                  \
> -  ({ __typeof (*(mem)) __atg14_old;                                    \
> -    __typeof (mem) __atg14_memp = (mem);                               \
> -    __typeof (*(mem)) __atg14_mask = ((__typeof (*(mem))) 1 << (bit)); \
> -    __atg14_old = atomic_or_val (__atg14_memp, __atg14_mask);          \
> -    __atg14_old & __atg14_mask; })
> -
>  /* Atomically *mem &= mask and return the old value of *mem.  */
>  /* The gcc builtin uses load-and-and instruction on z196 zarch and higher cpus
>     instead of a loop with compare-and-swap instruction.  */
> diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> index deb6cadaa35e841e23cee55d169a20538bdb1f8d..c5eb5c639fb59d7395c0a2d8f4fd72452845914b 100644
> --- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> +++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> @@ -159,10 +159,6 @@
>
>  # define atomic_max_relaxed(mem, value) asm_amo ("amomaxu", "", mem, value)
>
> -# define atomic_bit_test_set(mem, bit)                   \
> -  ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
> -     asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
> -
>  #else /* __riscv_atomic */
>  # error "ISAs that do not subsume the A extension are not supported"
>  #endif /* !__riscv_atomic */
> diff --git a/sysdeps/unix/sysv/linux/sh/atomic-machine.h b/sysdeps/unix/sysv/linux/sh/atomic-machine.h
> index aead3d5866d12b3b6bba27a1ef4a273400e1715b..3bf84d06bc875fca75b38b5ab01fca6683d390f7 100644
> --- a/sysdeps/unix/sysv/linux/sh/atomic-machine.h
> +++ b/sysdeps/unix/sysv/linux/sh/atomic-machine.h
> @@ -301,97 +301,3 @@
>
>  #define atomic_increment_and_test(mem) atomic_add_zero((mem), 1)
>  #define atomic_decrement_and_test(mem) atomic_add_zero((mem), -1)
> -
> -#define atomic_bit_set(mem, bit) \
> -  (void) ({ unsigned int __mask = 1 << (bit); \
> -           if (sizeof (*(mem)) == 1) \
> -             __asm __volatile ("\
> -               mova 1f,r0\n\
> -               mov r15,r1\n\
> -               .align 2\n\
> -               mov #(0f-1f),r15\n\
> -            0: mov.b @%0,r2\n\
> -               or %1,r2\n\
> -               mov.b r2,@%0\n\
> -            1: mov r1,r15"\
> -               : : "u" (mem), "u" (__mask) \
> -               : "r0", "r1", "r2", "memory"); \
> -           else if (sizeof (*(mem)) == 2) \
> -             __asm __volatile ("\
> -               mova 1f,r0\n\
> -               mov r15,r1\n\
> -               .align 2\n\
> -               mov #(0f-1f),r15\n\
> -            0: mov.w @%0,r2\n\
> -               or %1,r2\n\
> -               mov.w r2,@%0\n\
> -            1: mov r1,r15"\
> -               : : "u" (mem), "u" (__mask) \
> -               : "r0", "r1", "r2", "memory"); \
> -           else if (sizeof (*(mem)) == 4) \
> -             __asm __volatile ("\
> -               mova 1f,r0\n\
> -               mov r15,r1\n\
> -               .align 2\n\
> -               mov #(0f-1f),r15\n\
> -            0: mov.l @%0,r2\n\
> -               or %1,r2\n\
> -               mov.l r2,@%0\n\
> -            1: mov r1,r15"\
> -               : : "u" (mem), "u" (__mask) \
> -               : "r0", "r1", "r2", "memory"); \
> -           else \
> -             abort (); \
> -           })
> -
> -#define atomic_bit_test_set(mem, bit) \
> -  ({ unsigned int __mask = 1 << (bit); \
> -     unsigned int __result = __mask; \
> -     if (sizeof (*(mem)) == 1) \
> -       __asm __volatile ("\
> -         mova 1f,r0\n\
> -         .align 2\n\
> -         mov r15,r1\n\
> -         mov #(0f-1f),r15\n\
> -       0: mov.b @%2,r2\n\
> -         mov r2,r3\n\
> -         or %1,r2\n\
> -         mov.b r2,@%2\n\
> -       1: mov r1,r15\n\
> -         and r3,%0"\
> -       : "=&r" (__result), "=&r" (__mask) \
> -       : "u" (mem), "0" (__result), "1" (__mask) \
> -       : "r0", "r1", "r2", "r3", "memory");    \
> -     else if (sizeof (*(mem)) == 2) \
> -       __asm __volatile ("\
> -         mova 1f,r0\n\
> -         .align 2\n\
> -         mov r15,r1\n\
> -         mov #(0f-1f),r15\n\
> -       0: mov.w @%2,r2\n\
> -         mov r2,r3\n\
> -         or %1,r2\n\
> -         mov.w %1,@%2\n\
> -       1: mov r1,r15\n\
> -         and r3,%0"\
> -       : "=&r" (__result), "=&r" (__mask) \
> -       : "u" (mem), "0" (__result), "1" (__mask) \
> -       : "r0", "r1", "r2", "r3", "memory"); \
> -     else if (sizeof (*(mem)) == 4) \
> -       __asm __volatile ("\
> -         mova 1f,r0\n\
> -         .align 2\n\
> -         mov r15,r1\n\
> -         mov #(0f-1f),r15\n\
> -       0: mov.l @%2,r2\n\
> -         mov r2,r3\n\
> -         or r2,%1\n\
> -         mov.l %1,@%2\n\
> -       1: mov r1,r15\n\
> -         and r3,%0"\
> -       : "=&r" (__result), "=&r" (__mask) \
> -       : "u" (mem), "0" (__result), "1" (__mask) \
> -       : "r0", "r1", "r2", "r3", "memory"); \
> -     else \
> -       abort (); \
> -     __result; })
> diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
> index c27a437ec1b129fc18ca832708a69627959fc107..6adc219bf69f1507624618ed931dad11fea4e150 100644
> --- a/sysdeps/x86/atomic-machine.h
> +++ b/sysdeps/x86/atomic-machine.h
> @@ -292,56 +292,6 @@
>       __result; })
>
>
> -#define atomic_bit_set(mem, bit) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (LOCK_PREFIX "orb %b2, %0"                            \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem), IBR_CONSTRAINT (1L << (bit)));          \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (LOCK_PREFIX "orw %w2, %0"                            \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem), "ir" (1L << (bit)));                    \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (LOCK_PREFIX "orl %2, %0"                             \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem), "ir" (1L << (bit)));                    \
> -    else if (__builtin_constant_p (bit) && (bit) < 32)                       \
> -      __asm __volatile (LOCK_PREFIX "orq %2, %0"                             \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem), "i" (1L << (bit)));                     \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (LOCK_PREFIX "orq %q2, %0"                            \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem), "r" (1UL << (bit)));                    \
> -    else                                                                     \
> -      __atomic_link_error ();                                                \
> -  } while (0)
> -
> -
> -#define atomic_bit_test_set(mem, bit) \
> -  ({ unsigned char __result;                                                 \
> -     if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0"                  \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "m" (*mem), IBR_CONSTRAINT (bit));                 \
> -     else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0"                  \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "m" (*mem), "ir" (bit));                           \
> -     else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0"                  \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "m" (*mem), "ir" (bit));                           \
> -     else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (LOCK_PREFIX "btsq %3, %1; setc %0"                  \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "m" (*mem), "ir" (bit));                           \
> -     else                                                                    \
> -       __atomic_link_error ();                                       \
> -     __result; })
> -
> -
>  #define __arch_and_body(lock, mem, mask) \
>    do {                                                                       \
>      if (sizeof (*mem) == 1)                                                  \
  
Noah Goldstein July 6, 2022, 6:25 p.m. UTC | #2
On Wed, Jul 6, 2022 at 9:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jul 6, 2022 at 8:14 AM Wilco Dijkstra via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Replace the 3 uses of atomic_bit_set and atomic_bit_test_set with atomic_fetch_or_acquire.
> > This enables removal of a lot of target specific implementations.
> >
> > ---
> >
> > [...]
> > diff --git a/nptl/nptl_free_tcb.c b/nptl/nptl_free_tcb.c
> > index 0d31143ac849de6398e06c399b94813ae57dcff3..b63ec3bc3c1df4a1fc5272a24d453e679dbedd5e 100644
> > --- a/nptl/nptl_free_tcb.c
> > +++ b/nptl/nptl_free_tcb.c
> > @@ -24,7 +24,8 @@ void
> >  __nptl_free_tcb (struct pthread *pd)
> >  {
> >    /* The thread is exiting now.  */
> > -  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> > +  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
> > +      & (1 << TERMINATED_BIT)) == 0)
>
> I think this change may cause regressions on x86 because at the moment
> it seems the only way to generate `lock bts` is with inline assembly:
>
> https://godbolt.org/z/bdchE53Ps

GCC does have a pass for this:
https://github.com/gcc-mirror/gcc/blob/master/gcc/tree-ssa-ccp.cc
but it needs to be done in a very specific way:
https://godbolt.org/z/s9dchva6G

Can these remain macros that set up the bit_test_set properly but
use compiler intrinsics?
>
> Can we keep atomic_bit_test_set?
> >      {
> >        /* Free TPP data.  */
> >        if (pd->tpp != NULL)
> > [...]
  
Wilco Dijkstra July 6, 2022, 6:59 p.m. UTC | #3
Hi Noah,

> > -  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> > +  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
> > +      & (1 << TERMINATED_BIT)) == 0)

> Can these remain macros that set up the bit_test_set properly but
> use compiler intrinsics?

The fetch_or above is already written in a way that allows GCC to recognize it
(it's identical to your do_atomic_or0), so I don't see any issue here. However
there is a TERMINATED_BITMASK which might make it more readable.

Cheers,
Wilco
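
For illustration, with the existing bitmask constant the test would read
as follows (a sketch of the suggestion, not the committed code):

  if ((atomic_fetch_or_acquire (&pd->cancelhandling, TERMINATED_BITMASK)
       & TERMINATED_BITMASK) == 0)
    /* We were the first to set the bit.  */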
  
Noah Goldstein July 6, 2022, 7:14 p.m. UTC | #4
On Wed, Jul 6, 2022 at 12:00 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> > > -  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> > > +  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
> > > +      & (1 << TERMINATED_BIT)) == 0)
>
> > Can these remain macros that set up the bit_test_set properly but
> > use compiler intrinsics?
>
> The fetch_or above is already written in a way that allows GCC to recognize it
> (it's identical to your do_atomic_or0), so I don't see any issue here. However
> there is a TERMINATED_BITMASK which might make it more readable.

For non-constant shift values it needs to be like atomic_or4/atomic_or5.

Since this needs to be set up in a particular way it seems worth it to
keep it as a macro that will do it properly. Fine with replacing the underlying
atomic op with a compiler intrinsic.
>
> Cheers,
> Wilco
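
A sketch of the kind of wrapper being proposed (hypothetical; this is not
what was committed), keeping the (fetch_or & mask) != 0 shape discussed
above.  Whether a given GCC version turns it into a locked bit-test-and-set
depends on the operand type and on how the mask is formed:

  /* Hypothetical: atomically set bit in *mem and return its old value
     as 0/1, using the compiler intrinsic underneath.  */
  #define atomic_bit_test_set(mem, bit)                                 \
    ({ __typeof (*(mem)) __mask = (__typeof (*(mem))) 1 << (bit);       \
       (__atomic_fetch_or ((mem), __mask, __ATOMIC_ACQUIRE)             \
        & __mask) != 0; })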
  
H.J. Lu July 6, 2022, 7:30 p.m. UTC | #5
On Wed, Jul 6, 2022 at 12:15 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Jul 6, 2022 at 12:00 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Hi Noah,
> >
> > > > -  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> > > > +  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
> > > > +      & (1 << TERMINATED_BIT)) == 0)
> >
> > > Can these remain macros that set up the bit_test_set properly but
> > > use compiler intrinsics?
> >
> > The fetch_or above is already written in a way that allows GCC to recognize it
> > (it's identical to your do_atomic_or0), so I don't see any issue here. However
> > there is a TERMINATED_BITMASK which might make it more readable.
>
> For non-constant shift values it needs to be like atomic_or4/atomic_or5.
>
> Since this needs to be set up in a particular way it seems worth it to
> keep it as a macro that will do it properly. Fine with replacing the underlying
> atomic op with a compiler intrinsic.

GCC can optimize some atomic operations to locked bit operations.
Have we checked whether the new assembly code is as efficient as the
old code on x86-64?
  
Wilco Dijkstra July 6, 2022, 7:36 p.m. UTC | #6
Hi Noah,

> For non-constant shift values it needs to be like atomic_or4/atomic_or5.
>
> Since this needs to be set up in a particular way it seems worth it to
> keep it as a macro that will do it properly. Fine with replacing the underlying
> atomic op with a compiler intrinsic.

There was only one use of this in glibc, so how does adding a private macro
help other software? This idiom doesn't appear to be common, particularly with
a non-constant bit, but in principle the compiler could be fixed so that it
optimizes do_atomic_or2/3 as well.

Cheers,
Wilco
  
Noah Goldstein July 6, 2022, 7:51 p.m. UTC | #7
On Wed, Jul 6, 2022 at 12:37 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> > For non-constant shift values it needs to be like atomic_or4/atomic_or5.
> >
> > Since this needs to be set up in a particular way it seems worth it to
> > keep it as a macro that will do it properly. Fine with replacing the underlying
> > atomic op with a compiler intrinsic.
>
> There was only one use of this in glibc, so how does adding a private macro
> help other software? This idiom doesn't appear to be common, particularly with
> a non-constant bit, but in principle the compiler could be fixed so that it
> optimizes do_atomic_or2/3 as well.

The usage in this patch is fine. It just seems that having an internal API
for it provides value, given that in the future there may be a need for
atomic_bit_set without a constant bit, and there are non-obvious tricks
necessary for doing it right.


>
> Cheers,
> Wilco
  
Noah Goldstein July 6, 2022, 7:56 p.m. UTC | #8
On Wed, Jul 6, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jul 6, 2022 at 12:37 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Hi Noah,
> >
> > > For non-constant shift values it needs to be like atomic_or4/atomic_or5.
> > >
> > > Since this needs to be set up in a particular way it seems worth it to
> > > keep it as a macro that will do it properly. Fine with replacing the underlying
> > > atomic op with a compiler intrinsic.
> >
> > There was only one use of this in glibc, so how does adding a private macro
> > help other software? This idiom doesn't appear to be common, particularly with
> > a non-constant bit, but in principle the compiler could be fixed so that it
> > optimizes do_atomic_or2/3 as well.
>
> The usage in this patch is fine. It just seems that having an internal API
> for it provides value, given that in the future there may be a need for
> atomic_bit_set without a constant bit, and there are non-obvious tricks
> necessary for doing it right.

Also, frankly, I think the API "atomic_bit_set" provides readability value
that is lost when you expand it.

The fetch_or_acquire seems like an implementation detail of
the desired functionality.
>
>
> >
> > Cheers,
> > Wilco
  
Wilco Dijkstra July 6, 2022, 8:14 p.m. UTC | #9
Hi Noah,

The goal here is to move to the standard atomics, not to invent our own set for
convenience or just for fun to be different etc. There were 2 uses of atomic_bit_set
in all of GLIBC, and I cannot believe that atomic_fetch_or (&x, 1 << bit) could cause
any confusion given that it is very similar to x |= 1 << bit. We don't wrap such
expressions in macros like bitset (x, bit) either - it just doesn't make sense.

As I mentioned, I'll use EXITING_BITMASK rather than 1 << EXITING_BIT.

Cheers,
Wilco
  
Noah Goldstein July 6, 2022, 8:30 p.m. UTC | #10
On Wed, Jul 6, 2022 at 1:14 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> The goal here is to move to the standard atomics, not to invent our own set for
> convenience or just for fun to be different etc. There were 2 uses of atomic_bit_set
> in all of GLIBC, and I cannot believe that atomic_fetch_or (&x, 1 << bit) could cause
> any confusion given that it is very similar to x |= 1 << bit. We don't wrap such
> expressions in macros like bitset (x, bit) either - it just doesn't make sense.
>
> As I mentioned, I'll use EXITING_BITMASK rather than 1 << EXITING_BIT.

Alright.

>
> Cheers,
> Wilco
  
H.J. Lu July 6, 2022, 8:56 p.m. UTC | #11
On Wed, Jul 6, 2022 at 1:31 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Jul 6, 2022 at 1:14 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Hi Noah,
> >
> > The goal here is to move to the standard atomics, not to invent our own set for
> > convenience or just for fun to be different etc. There were 2 uses of atomic_bit_set
> > in all of GLIBC, and I cannot believe that atomic_fetch_or (&x, 1 << bit) could cause
> > any confusion given that it is very similar to x |= 1 << bit. We don't wrap such
> > expressions in macros like bitset (x, bit) either - it just doesn't make sense.
> >
> > As I mentioned, I'll use EXITING_BITMASK rather than 1 << EXITING_BIT.
>
> Alright.

Before GCC 12,

(__atomic_fetch_or (&x, MASK, __ATOMIC_ACQUIRE) & MASK) != 0;

will be optimized to "lock btsl" only if x is unsigned.  But cancelhandling in

if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)

is signed, which leads to much worse code when GCC 11 or older is used.
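
A minimal pair illustrating the point (a sketch; the mask value is
illustrative, not glibc's actual bit assignment):

  #define MASK (1u << 5)

  int
  test_unsigned (unsigned int *x)
  {
    /* Pre-GCC-12 this compiles to `lock btsl; setc` because *x is
       unsigned.  */
    return (__atomic_fetch_or (x, MASK, __ATOMIC_ACQUIRE) & MASK) != 0;
  }

  int
  test_signed (int *x)
  {
    /* With a signed int (like cancelhandling), older GCC instead emits
       much worse code, typically a compare-exchange loop.  */
    return (__atomic_fetch_or (x, MASK, __ATOMIC_ACQUIRE) & MASK) != 0;
  }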
  
Adhemerval Zanella Netto July 12, 2022, 5:47 p.m. UTC | #12
On 06/07/22 17:56, H.J. Lu via Libc-alpha wrote:
> On Wed, Jul 6, 2022 at 1:31 PM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> On Wed, Jul 6, 2022 at 1:14 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>>>
>>> Hi Noah,
>>>
>>> The goal here is to move to the standard atomics, not to invent our own set for
>>> convenience or just for fun to be different etc. There were 2 uses of atomic_bit_set
>>> in all of GLIBC, and I cannot believe that atomic_fetch_or (&x, 1 << bit) could cause
>>> any confusion given that it is very similar to x |= 1 << bit. We don't wrap such
>>> expressions in macros like bitset (x, bit) either - it just doesn't make sense.
>>>
>>> As I mentioned, I'll use EXITING_BITMASK rather than 1 << EXITING_BIT.
>>
>> Alright.
> 
> Before GCC 12,
> 
> (__atomic_fetch_or (&x, MASK, __ATOMIC_ACQUIRE) & MASK) != 0;
> 
> will be optimized to "lock btsl" only if x is unsigned.  But cancelhandling in
> 
> if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> 
> is signed, which leads to much worse code when GCC 11 or older is used.
> 

I think it should be safe to change cancelhandling to be unsigned, although
this is really a micro-optimization that we should handle in the compiler
instead of rolling our own atomics.
  
H.J. Lu July 12, 2022, 6:09 p.m. UTC | #13
On Tue, Jul 12, 2022 at 10:47 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 06/07/22 17:56, H.J. Lu via Libc-alpha wrote:
> > On Wed, Jul 6, 2022 at 1:31 PM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> On Wed, Jul 6, 2022 at 1:14 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >>>
> >>> Hi Noah,
> >>>
> >>> The goal here is to move to the standard atomics, not to invent our own set for
> >>> convenience or just for fun to be different etc. There were 2 uses of atomic_bit_set
> >>> in all of GLIBC, and I cannot believe that atomic_fetch_or (&x, 1 << bit) could cause
> >>> any confusion given that it is very similar to x |= 1 << bit. We don't wrap such
> >>> expressions in macros like bitset (x, bit) either - it just doesn't make sense.
> >>>
> >>> As I mentioned, I'll use EXITING_BITMASK rather than 1 << EXITING_BIT.
> >>
> >> Alright.
> >
> > Before GCC 12,
> >
> > (__atomic_fetch_or (&x, MASK, __ATOMIC_ACQUIRE) & MASK) != 0;
> >
> > will be optimized to "lock btsl" only if x is unsigned.  But cancelhandling in
> >
> > if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
> >
> > is signed, which leads to much worse code when GCC 11 or older is used.
> >
>
> I think it should be safe to change cancelhandling to be unsigned, although
> this is really a micro-optimization that we should handle in the compiler
> instead of rolling our own atomics.

We should help older compilers in this case.  Can we change cancelhandling
to unsigned?

Thanks.
  
Wilco Dijkstra July 12, 2022, 6:44 p.m. UTC | #14
Hi,

I don't believe it could make a difference since it isn't performance critical code.
Note that several similar fetch_or/fetch_and are used which do not optimize into a
bitset operation, so even if we fix this particular case to use unsigned, there are many
others. IIRC there is even a fetch_or with zero which is kind of useless!

More importantly, it raises a question for locking: we currently use a compare-exchange
for each lock and unlock. If compare-exchange has higher overheads than other atomics
then we should investigate more efficient locking sequences.

Cheers,
Wilco
  
H.J. Lu July 12, 2022, 7:11 p.m. UTC | #15
On Tue, Jul 12, 2022 at 11:44 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> I don't believe it could make a difference since it isn't performance critical code.
> Note that several similar fetch_or/fetch_and are used which do not optimize into a

Only the operations with test have this issue.  Simple fetch_or and
fetch_and are OK.

> bitset operation, so even if we fix this particular case to use unsigned, there are many
> others. IIRC there is even a fetch_or with zero which is kind of useless!
>
> More importantly, it raises a question for locking: we currently use a compare-exchange
> for each lock and unlock. If compare-exchange has higher overheads than other atomics
> then we should investigate more efficient locking sequences.

compare-exchange can be very expensive under contention.

> Cheers,
> Wilco
  
Noah Goldstein July 12, 2022, 7:18 p.m. UTC | #16
On Tue, Jul 12, 2022 at 12:11 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 11:44 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Hi,
> >
> > I don't believe it could make a difference since it isn't performance critical code.
> > Note that several similar fetch_or/fetch_and are used which do not optimize into a
>
> Only the operations with test have this issue.  Simple fetch_or and
> fetch_and are OK.
>
> > bitset operation, so even if we fix this particular case to use unsigned, there are many
> > others. IIRC there is even a fetch_or with zero which is kind of useless!
> >
> > More importantly, it raises a question for locking: we currently use a compare-exchange
> > for each lock and unlock. If compare-exchange has higher overheads than other atomics
> > then we should investigate more efficient locking sequences.
>
> compare-exchange can be very expensive under contention.
>

Which locks should we look into dropping CAS for? pthread_mutex? Anything else?
> > Cheers,
> > Wilco
>
>
>
> --
> H.J.
  
Wilco Dijkstra July 12, 2022, 8:39 p.m. UTC | #17
Hi,

>> > More importantly, it raises a question for locking: we currently use a compare-exchange
>> > for each lock and unlock. If compare-exchange has higher overheads than other atomics
>> > then we should investigate more efficient locking sequences.
>>
>> compare-exchange can be very expensive under contention.
>>
>
> Which locks should we look into dropping CAS for? pthread_mutex? Anything else?

There are several internal locking schemes plus uses of CAS spread all over.
There is also the odd define ATOMIC_EXCHANGE_USES_CAS, which almost no target
seems to know what to set it to - it is used in pthread_spin_lock in a way that
makes no sense. I believe the answer should be 0 for all targets, since exchange
is typically simpler and faster than compare-exchange.

Cheers,
Wilco
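
As a sketch of the two acquire sequences being compared (illustrative only,
not glibc's actual spinlock code; function names are made up):

  /* CAS-based acquire: each attempt is a full compare-and-exchange.  */
  static void
  lock_with_cas (int *lock)
  {
    int expected = 0;
    while (!__atomic_compare_exchange_n (lock, &expected, 1, 0,
                                         __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
      expected = 0;   /* The failed CAS wrote the observed value back.  */
  }

  /* Exchange-based acquire: an unconditional swap, typically simpler
     and faster than compare-exchange, per the point above.  */
  static void
  lock_with_exchange (int *lock)
  {
    while (__atomic_exchange_n (lock, 1, __ATOMIC_ACQUIRE) != 0)
      ;
  }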
  

Patch

diff --git a/include/atomic.h b/include/atomic.h
index 73cc772f0149f94fa0c3e14fa858fa89fee6985f..ed1fc38e7569fcbdd0473ab6f69956a44d62354f 100644
--- a/include/atomic.h
+++ b/include/atomic.h
@@ -255,28 +255,6 @@ 
 #endif
 
 
-#ifndef atomic_bit_set
-# define atomic_bit_set(mem, bit) \
-  (void) atomic_bit_test_set(mem, bit)
-#endif
-
-
-#ifndef atomic_bit_test_set
-# define atomic_bit_test_set(mem, bit) \
-  ({ __typeof (*(mem)) __atg14_old;					      \
-     __typeof (mem) __atg14_memp = (mem);				      \
-     __typeof (*(mem)) __atg14_mask = ((__typeof (*(mem))) 1 << (bit));	      \
-									      \
-     do									      \
-       __atg14_old = (*__atg14_memp);					      \
-     while (__builtin_expect						      \
-	    (atomic_compare_and_exchange_bool_acq (__atg14_memp,	      \
-						   __atg14_old | __atg14_mask,\
-						   __atg14_old), 0));	      \
-									      \
-     __atg14_old & __atg14_mask; })
-#endif
-
 /* Atomically *mem &= mask.  */
 #ifndef atomic_and
 # define atomic_and(mem, mask) \
diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
index 13acf0da8f1ab41774175401a68ce28a0eb7ed30..765b036873e181d6c49299c0ad08d5dbfcc05387 100644
--- a/misc/tst-atomic.c
+++ b/misc/tst-atomic.c
@@ -325,66 +325,6 @@  do_test (void)
       ret = 1;
     }
 
-  mem = 0;
-  atomic_bit_set (&mem, 1);
-  if (mem != 2)
-    {
-      puts ("atomic_bit_set test 1 failed");
-      ret = 1;
-    }
-
-  mem = 8;
-  atomic_bit_set (&mem, 3);
-  if (mem != 8)
-    {
-      puts ("atomic_bit_set test 2 failed");
-      ret = 1;
-    }
-
-#ifdef TEST_ATOMIC64
-  mem = 16;
-  atomic_bit_set (&mem, 35);
-  if (mem != 0x800000010LL)
-    {
-      puts ("atomic_bit_set test 3 failed");
-      ret = 1;
-    }
-#endif
-
-  mem = 0;
-  if (atomic_bit_test_set (&mem, 1)
-      || mem != 2)
-    {
-      puts ("atomic_bit_test_set test 1 failed");
-      ret = 1;
-    }
-
-  mem = 8;
-  if (! atomic_bit_test_set (&mem, 3)
-      || mem != 8)
-    {
-      puts ("atomic_bit_test_set test 2 failed");
-      ret = 1;
-    }
-
-#ifdef TEST_ATOMIC64
-  mem = 16;
-  if (atomic_bit_test_set (&mem, 35)
-      || mem != 0x800000010LL)
-    {
-      puts ("atomic_bit_test_set test 3 failed");
-      ret = 1;
-    }
-
-  mem = 0x100000000LL;
-  if (! atomic_bit_test_set (&mem, 32)
-      || mem != 0x100000000LL)
-    {
-      puts ("atomic_bit_test_set test 4 failed");
-      ret = 1;
-    }
-#endif
-
   /* Tests for C11-like atomics.  */
   mem = 11;
   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
diff --git a/nptl/nptl_free_tcb.c b/nptl/nptl_free_tcb.c
index 0d31143ac849de6398e06c399b94813ae57dcff3..b63ec3bc3c1df4a1fc5272a24d453e679dbedd5e 100644
--- a/nptl/nptl_free_tcb.c
+++ b/nptl/nptl_free_tcb.c
@@ -24,7 +24,8 @@  void
 __nptl_free_tcb (struct pthread *pd)
 {
   /* The thread is exiting now.  */
-  if (atomic_bit_test_set (&pd->cancelhandling, TERMINATED_BIT) == 0)
+  if ((atomic_fetch_or_acquire (&pd->cancelhandling, 1 << TERMINATED_BIT)
+      & (1 << TERMINATED_BIT)) == 0)
     {
       /* Free TPP data.  */
       if (pd->tpp != NULL)
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 870a8fcb34eb43b58c2260fee6a4624f0fbbd469..4b7f3edc384748f300ca935ad878eb0e3547e163 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -487,7 +487,7 @@  start_thread (void *arg)
   /* The thread is exiting now.  Don't set this bit until after we've hit
      the event-reporting breakpoint, so that td_thr_get_info on us while at
      the breakpoint reports TD_THR_RUN state rather than TD_THR_ZOMBIE.  */
-  atomic_bit_set (&pd->cancelhandling, EXITING_BIT);
+  atomic_fetch_or_acquire (&pd->cancelhandling, 1 << EXITING_BIT);
 
   if (__glibc_unlikely (atomic_decrement_and_test (&__nptl_nthreads)))
     /* This was the last thread.  */
diff --git a/sysdeps/alpha/atomic-machine.h b/sysdeps/alpha/atomic-machine.h
index df28b90261aa485125d951bc1abf76602730bfd0..115a9df5d77cd08bcb0a49d9b59f0c53b4a20d78 100644
--- a/sysdeps/alpha/atomic-machine.h
+++ b/sysdeps/alpha/atomic-machine.h
@@ -325,15 +325,6 @@ 
 #define atomic_exchange_and_add(mem, value) \
   __atomic_val_bysize (__arch_exchange_and_add, int, mem, value, __MB, __MB)
 
-
-/* ??? Blah, I'm lazy.  Implement these later.  Can do better than the
-   compare-and-exchange loop provided by generic code.
-
-#define atomic_decrement_if_positive(mem)
-#define atomic_bit_test_set(mem, bit)
-
-*/
-
 #define atomic_full_barrier()	__asm ("mb" : : : "memory");
 #define atomic_read_barrier()	__asm ("mb" : : : "memory");
 #define atomic_write_barrier()	__asm ("wmb" : : : "memory");
diff --git a/sysdeps/ia64/atomic-machine.h b/sysdeps/ia64/atomic-machine.h
index 71afcfe0d0031a6ceae5879a2b3b19ed2ee2f110..b2f5d2f4774cc2503c7595cb82f30f60fbcbe89c 100644
--- a/sysdeps/ia64/atomic-machine.h
+++ b/sysdeps/ia64/atomic-machine.h
@@ -77,20 +77,4 @@ 
      while (__builtin_expect (__val != __oldval, 0));			      \
      __oldval; })
 
-#define atomic_bit_test_set(mem, bit) \
-  ({ __typeof (*mem) __oldval, __val;					      \
-     __typeof (mem) __memp = (mem);					      \
-     __typeof (*mem) __mask = ((__typeof (*mem)) 1 << (bit));		      \
-									      \
-     __val = (*__memp);							      \
-     do									      \
-       {								      \
-	 __oldval = __val;						      \
-	 __val = atomic_compare_and_exchange_val_acq (__memp,		      \
-						      __oldval | __mask,      \
-						      __oldval);	      \
-       }								      \
-     while (__builtin_expect (__val != __oldval, 0));			      \
-     __oldval & __mask; })
-
 #define atomic_full_barrier() __sync_synchronize ()
diff --git a/sysdeps/m68k/m680x0/m68020/atomic-machine.h b/sysdeps/m68k/m680x0/m68020/atomic-machine.h
index 70cda007341b49d28dc1d9847a88198b47589db4..72a3b81642c1a23207054092d16bb856556cd897 100644
--- a/sysdeps/m68k/m680x0/m68020/atomic-machine.h
+++ b/sysdeps/m68k/m680x0/m68020/atomic-machine.h
@@ -218,15 +218,3 @@ 
 			   : "memory");					      \
        }								      \
      __result; })
-
-#define atomic_bit_set(mem, bit) \
-  __asm __volatile ("bfset %0{%1,#1}"					      \
-		    : "+m" (*(mem))					      \
-		    : "di" (sizeof (*(mem)) * 8 - (bit) - 1))
-
-#define atomic_bit_test_set(mem, bit) \
-  ({ char __result;							      \
-     __asm __volatile ("bfset %1{%2,#1}; sne %0"			      \
-		       : "=dm" (__result), "+m" (*(mem))		      \
-		       : "di" (sizeof (*(mem)) * 8 - (bit) - 1));	      \
-     __result; })
diff --git a/sysdeps/nptl/pthreadP.h b/sysdeps/nptl/pthreadP.h
index 39af275c254ef3e737736bd5c38099bada8746d6..e8bd4c6dcbf7b7785857ff08c15dac9a5cfb2a5d 100644
--- a/sysdeps/nptl/pthreadP.h
+++ b/sysdeps/nptl/pthreadP.h
@@ -43,12 +43,6 @@ 
   atomic_compare_and_exchange_val_acq (&(descr)->member, new, old)
 #endif
 
-#ifndef THREAD_ATOMIC_BIT_SET
-# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
-  atomic_bit_set (&(descr)->member, bit)
-#endif
-
-
 static inline short max_adaptive_count (void)
 {
 #if HAVE_TUNABLES
@@ -276,7 +270,7 @@  __do_cancel (void)
   struct pthread *self = THREAD_SELF;
 
   /* Make sure we get no more cancellations.  */
-  atomic_bit_set (&self->cancelhandling, EXITING_BIT);
+  atomic_fetch_or_acquire (&self->cancelhandling, 1 << EXITING_BIT);
 
   __pthread_unwind ((__pthread_unwind_buf_t *)
 		    THREAD_GETMEM (self, cleanup_jmp_buf));
diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
index fe7f04cf30cf5b62fea743a72384f8fc248d2d25..db31e377970c4ab6285ef65fb21419db7c6ca373 100644
--- a/sysdeps/s390/atomic-machine.h
+++ b/sysdeps/s390/atomic-machine.h
@@ -93,17 +93,6 @@ 
     atomic_or_val (mem, mask);			\
   } while (0)
 
-/* Atomically *mem |= 1 << bit and return true if the bit was set in old value
-   of *mem.  */
-/* The load-and-or instruction is used on z196 zarch and higher cpus
-   instead of a loop with compare-and-swap instruction.  */
-#define atomic_bit_test_set(mem, bit)					\
-  ({ __typeof (*(mem)) __atg14_old;					\
-    __typeof (mem) __atg14_memp = (mem);				\
-    __typeof (*(mem)) __atg14_mask = ((__typeof (*(mem))) 1 << (bit));	\
-    __atg14_old = atomic_or_val (__atg14_memp, __atg14_mask);		\
-    __atg14_old & __atg14_mask; })
-
 /* Atomically *mem &= mask and return the old value of *mem.  */
 /* The gcc builtin uses load-and-and instruction on z196 zarch and higher cpus
    instead of a loop with compare-and-swap instruction.  */
diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
index deb6cadaa35e841e23cee55d169a20538bdb1f8d..c5eb5c639fb59d7395c0a2d8f4fd72452845914b 100644
--- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
@@ -159,10 +159,6 @@ 
 
 # define atomic_max_relaxed(mem, value) asm_amo ("amomaxu", "", mem, value)
 
-# define atomic_bit_test_set(mem, bit)                   \
-  ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
-     asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
-
 #else /* __riscv_atomic */
 # error "ISAs that do not subsume the A extension are not supported"
 #endif /* !__riscv_atomic */
diff --git a/sysdeps/unix/sysv/linux/sh/atomic-machine.h b/sysdeps/unix/sysv/linux/sh/atomic-machine.h
index aead3d5866d12b3b6bba27a1ef4a273400e1715b..3bf84d06bc875fca75b38b5ab01fca6683d390f7 100644
--- a/sysdeps/unix/sysv/linux/sh/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/sh/atomic-machine.h
@@ -301,97 +301,3 @@ 
 
 #define atomic_increment_and_test(mem) atomic_add_zero((mem), 1)
 #define atomic_decrement_and_test(mem) atomic_add_zero((mem), -1)
-
-#define atomic_bit_set(mem, bit) \
-  (void) ({ unsigned int __mask = 1 << (bit); \
-	    if (sizeof (*(mem)) == 1) \
-	      __asm __volatile ("\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		.align 2\n\
-		mov #(0f-1f),r15\n\
-	     0: mov.b @%0,r2\n\
-		or %1,r2\n\
-		mov.b r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "u" (mem), "u" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 2) \
-	      __asm __volatile ("\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		.align 2\n\
-		mov #(0f-1f),r15\n\
-	     0: mov.w @%0,r2\n\
-		or %1,r2\n\
-		mov.w r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "u" (mem), "u" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 4) \
-	      __asm __volatile ("\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		.align 2\n\
-		mov #(0f-1f),r15\n\
-	     0: mov.l @%0,r2\n\
-		or %1,r2\n\
-		mov.l r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "u" (mem), "u" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else \
-	      abort (); \
-	    })
-
-#define atomic_bit_test_set(mem, bit) \
-  ({ unsigned int __mask = 1 << (bit); \
-     unsigned int __result = __mask; \
-     if (sizeof (*(mem)) == 1) \
-       __asm __volatile ("\
-	  mova 1f,r0\n\
-	  .align 2\n\
-	  mov r15,r1\n\
-	  mov #(0f-1f),r15\n\
-       0: mov.b @%2,r2\n\
-	  mov r2,r3\n\
-	  or %1,r2\n\
-	  mov.b r2,@%2\n\
-       1: mov r1,r15\n\
-	  and r3,%0"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "u" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "r3", "memory");	\
-     else if (sizeof (*(mem)) == 2) \
-       __asm __volatile ("\
-	  mova 1f,r0\n\
-	  .align 2\n\
-	  mov r15,r1\n\
-	  mov #(0f-1f),r15\n\
-       0: mov.w @%2,r2\n\
-	  mov r2,r3\n\
-	  or %1,r2\n\
-	  mov.w %1,@%2\n\
-       1: mov r1,r15\n\
-	  and r3,%0"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "u" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "r3", "memory"); \
-     else if (sizeof (*(mem)) == 4) \
-       __asm __volatile ("\
-	  mova 1f,r0\n\
-	  .align 2\n\
-	  mov r15,r1\n\
-	  mov #(0f-1f),r15\n\
-       0: mov.l @%2,r2\n\
-	  mov r2,r3\n\
-	  or r2,%1\n\
-	  mov.l %1,@%2\n\
-       1: mov r1,r15\n\
-	  and r3,%0"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "u" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "r3", "memory"); \
-     else \
-       abort (); \
-     __result; })
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index c27a437ec1b129fc18ca832708a69627959fc107..6adc219bf69f1507624618ed931dad11fea4e150 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -292,56 +292,6 @@ 
      __result; })
 
 
-#define atomic_bit_set(mem, bit) \
-  do {									      \
-    if (sizeof (*mem) == 1)						      \
-      __asm __volatile (LOCK_PREFIX "orb %b2, %0"			      \
-			: "=m" (*mem)					      \
-			: "m" (*mem), IBR_CONSTRAINT (1L << (bit)));	      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (LOCK_PREFIX "orw %w2, %0"			      \
-			: "=m" (*mem)					      \
-			: "m" (*mem), "ir" (1L << (bit)));		      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (LOCK_PREFIX "orl %2, %0"			      \
-			: "=m" (*mem)					      \
-			: "m" (*mem), "ir" (1L << (bit)));		      \
-    else if (__builtin_constant_p (bit) && (bit) < 32)			      \
-      __asm __volatile (LOCK_PREFIX "orq %2, %0"			      \
-			: "=m" (*mem)					      \
-			: "m" (*mem), "i" (1L << (bit)));		      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (LOCK_PREFIX "orq %q2, %0"			      \
-			: "=m" (*mem)					      \
-			: "m" (*mem), "r" (1UL << (bit)));		      \
-    else								      \
-      __atomic_link_error ();						      \
-  } while (0)
-
-
-#define atomic_bit_test_set(mem, bit) \
-  ({ unsigned char __result;						      \
-     if (sizeof (*mem) == 1)						      \
-       __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0"		      \
-			 : "=q" (__result), "=m" (*mem)			      \
-			 : "m" (*mem), IBR_CONSTRAINT (bit));		      \
-     else if (sizeof (*mem) == 2)					      \
-       __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0"		      \
-			 : "=q" (__result), "=m" (*mem)			      \
-			 : "m" (*mem), "ir" (bit));			      \
-     else if (sizeof (*mem) == 4)					      \
-       __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0"		      \
-			 : "=q" (__result), "=m" (*mem)			      \
-			 : "m" (*mem), "ir" (bit));			      \
-     else if (__HAVE_64B_ATOMICS)					      \
-       __asm __volatile (LOCK_PREFIX "btsq %3, %1; setc %0"		      \
-			 : "=q" (__result), "=m" (*mem)			      \
-			 : "m" (*mem), "ir" (bit));			      \
-     else							      	      \
-       __atomic_link_error ();					      \
-     __result; })
-
-
 #define __arch_and_body(lock, mem, mask) \
   do {									      \
     if (sizeof (*mem) == 1)						      \