[v3] Remove catomics

Message ID AM5PR0801MB1668A36795A0E139F60469D3839C9@AM5PR0801MB1668.eurprd08.prod.outlook.com
State Superseded
Headers
Series [v3] Remove catomics |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Wilco Dijkstra Aug. 3, 2022, 11:42 a.m. UTC
  v3: rebased to latest GLIBC, keep COMPARE_AND_SWAP on ia64

The catomics are not supported on most targets and are only used in a few places which are not
performance critical, so replace all uses with more standard atomics.
Replace uses of catomic_add, catomic_increment, catomic_decrement and catomic_fetch_and_add with
atomic_fetch_add_relaxed which maps to a standard compiler builtin. Relaxed memory ordering is
correct for simple counters since they only need atomicity.

Passes regress on AArch64 and build-many-glibcs

---
  

Patch

diff --git a/elf/dl-fptr.c b/elf/dl-fptr.c
index 6645a260b809ecd521796e0d1adee56b3e0bd993..d6e63b807b597b886562657da2d007fc9053be72 100644
--- a/elf/dl-fptr.c
+++ b/elf/dl-fptr.c
@@ -40,7 +40,7 @@ 
 
 #ifndef COMPARE_AND_SWAP
 # define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 #endif
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
diff --git a/elf/dl-profile.c b/elf/dl-profile.c
index ec57e3a96552ae6460c22a0fcc819b85d486c0da..0af1f577d2d695d08edce9e13d9b39f77911b1d5 100644
--- a/elf/dl-profile.c
+++ b/elf/dl-profile.c
@@ -548,24 +548,24 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      size_t newfromidx;
 	      to_index = (data[narcs].self_pc
 			  / (HASHFRACTION * sizeof (*tos)));
-	      newfromidx = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      newfromidx = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      froms[newfromidx].here = &data[narcs];
 	      froms[newfromidx].link = tos[to_index];
 	      tos[to_index] = newfromidx;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 	    }
 
 	  /* If we still have no entry stop searching and insert.  */
 	  if (*topcindex == 0)
 	    {
-	      unsigned int newarc = catomic_exchange_and_add (narcsp, 1);
+	      unsigned int newarc = atomic_fetch_add_relaxed (narcsp, 1);
 
 	      /* In rare cases it could happen that all entries in FROMS are
 		 occupied.  So we cannot count this anymore.  */
 	      if (newarc >= fromlimit)
 		goto done;
 
-	      *topcindex = catomic_exchange_and_add (&fromidx, 1) + 1;
+	      *topcindex = atomic_fetch_add_relaxed (&fromidx, 1) + 1;
 	      fromp = &froms[*topcindex];
 
 	      fromp->here = &data[newarc];
@@ -573,7 +573,7 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
 	      data[newarc].self_pc = selfpc;
 	      data[newarc].count = 0;
 	      fromp->link = 0;
-	      catomic_increment (&narcs);
+	      atomic_fetch_add_relaxed (&narcs, 1);
 
 	      break;
 	    }
@@ -586,7 +586,7 @@  _dl_mcount (ElfW(Addr) frompc, ElfW(Addr) selfpc)
     }
 
   /* Increment the counter.  */
-  catomic_increment (&fromp->here->count);
+  atomic_fetch_add_relaxed (&fromp->here->count, 1);
 
  done:
   ;
diff --git a/include/atomic.h b/include/atomic.h
index 2cb52c9cfd894308b97b97a04dd574b2287bf1b2..264db9a0b7619ff6520f84a19c53c1eb9a3b42a3 100644
--- a/include/atomic.h
+++ b/include/atomic.h
@@ -24,13 +24,6 @@ 
    - atomic arithmetic and logic operation on memory.  They all
      have the prefix "atomic_".
 
-   - conditionally atomic operations of the same kinds.  These
-     always behave identical but can be faster when atomicity
-     is not really needed since only one thread has access to
-     the memory location.  In that case the code is slower in
-     the multi-thread case.  The interfaces have the prefix
-     "catomic_".
-
    - support functions like barriers.  They also have the prefix
      "atomic_".
 
@@ -93,29 +86,6 @@ 
 #endif
 
 
-#ifndef catomic_compare_and_exchange_val_acq
-# ifdef __arch_c_compare_and_exchange_val_32_acq
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  __atomic_val_bysize (__arch_c_compare_and_exchange_val,acq,		      \
-		       mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_acq(mem, newval, oldval) \
-  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# endif
-#endif
-
-
-#ifndef catomic_compare_and_exchange_val_rel
-# ifndef atomic_compare_and_exchange_val_rel
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
-# endif
-#endif
-
-
 #ifndef atomic_compare_and_exchange_val_rel
 # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
@@ -141,23 +111,6 @@ 
 #endif
 
 
-#ifndef catomic_compare_and_exchange_bool_acq
-# ifdef __arch_c_compare_and_exchange_bool_32_acq
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  __atomic_bool_bysize (__arch_c_compare_and_exchange_bool,acq,		      \
-		        mem, newval, oldval)
-# else
-#  define catomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
-  ({ /* Cannot use __oldval here, because macros later in this file might     \
-	call this macro with __oldval argument.	 */			      \
-     __typeof (oldval) __atg4_old = (oldval);				      \
-     catomic_compare_and_exchange_val_acq (mem, newval, __atg4_old)	      \
-       != __atg4_old;							      \
-  })
-# endif
-#endif
-
-
 /* Store NEWVALUE in *MEM and return the old value.  */
 #ifndef atomic_exchange_acq
 # define atomic_exchange_acq(mem, newvalue) \
@@ -212,24 +165,6 @@ 
   atomic_exchange_and_add_acq(mem, value)
 #endif
 
-#ifndef catomic_exchange_and_add
-# define catomic_exchange_and_add(mem, value) \
-  ({ __typeof (*(mem)) __atg7_oldv;					      \
-     __typeof (mem) __atg7_memp = (mem);				      \
-     __typeof (*(mem)) __atg7_value = (value);				      \
-									      \
-     do									      \
-       __atg7_oldv = *__atg7_memp;					      \
-     while (__builtin_expect						      \
-	    (catomic_compare_and_exchange_bool_acq (__atg7_memp,	      \
-						    __atg7_oldv		      \
-						    + __atg7_value,	      \
-						    __atg7_oldv), 0));	      \
-									      \
-     __atg7_oldv; })
-#endif
-
-
 #ifndef atomic_max
 # define atomic_max(mem, value) \
   do {									      \
@@ -246,25 +181,6 @@ 
   } while (0)
 #endif
 
-
-#ifndef catomic_max
-# define catomic_max(mem, value) \
-  do {									      \
-    __typeof (*(mem)) __atg9_oldv;					      \
-    __typeof (mem) __atg9_memp = (mem);					      \
-    __typeof (*(mem)) __atg9_value = (value);				      \
-    do {								      \
-      __atg9_oldv = *__atg9_memp;					      \
-      if (__atg9_oldv >= __atg9_value)					      \
-	break;								      \
-    } while (__builtin_expect						      \
-	     (catomic_compare_and_exchange_bool_acq (__atg9_memp,	      \
-						     __atg9_value,	      \
-						     __atg9_oldv), 0));	      \
-  } while (0)
-#endif
-
-
 #ifndef atomic_min
 # define atomic_min(mem, value) \
   do {									      \
@@ -288,32 +204,16 @@ 
 #endif
 
 
-#ifndef catomic_add
-# define catomic_add(mem, value) \
-  (void) catomic_exchange_and_add ((mem), (value))
-#endif
-
-
 #ifndef atomic_increment
 # define atomic_increment(mem) atomic_add ((mem), 1)
 #endif
 
 
-#ifndef catomic_increment
-# define catomic_increment(mem) catomic_add ((mem), 1)
-#endif
-
-
 #ifndef atomic_increment_val
 # define atomic_increment_val(mem) (atomic_exchange_and_add ((mem), 1) + 1)
 #endif
 
 
-#ifndef catomic_increment_val
-# define catomic_increment_val(mem) (catomic_exchange_and_add ((mem), 1) + 1)
-#endif
-
-
 /* Add one to *MEM and return true iff it's now zero.  */
 #ifndef atomic_increment_and_test
 # define atomic_increment_and_test(mem) \
@@ -326,21 +226,11 @@ 
 #endif
 
 
-#ifndef catomic_decrement
-# define catomic_decrement(mem) catomic_add ((mem), -1)
-#endif
-
-
 #ifndef atomic_decrement_val
 # define atomic_decrement_val(mem) (atomic_exchange_and_add ((mem), -1) - 1)
 #endif
 
 
-#ifndef catomic_decrement_val
-# define catomic_decrement_val(mem) (catomic_exchange_and_add ((mem), -1) - 1)
-#endif
-
-
 /* Subtract 1 from *MEM and return true iff it's now zero.  */
 #ifndef atomic_decrement_and_test
 # define atomic_decrement_and_test(mem) \
@@ -421,22 +311,6 @@ 
   } while (0)
 #endif
 
-#ifndef catomic_and
-# define catomic_and(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg20_old;					      \
-    __typeof (mem) __atg20_memp = (mem);				      \
-    __typeof (*(mem)) __atg20_mask = (mask);				      \
-									      \
-    do									      \
-      __atg20_old = (*__atg20_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg20_memp,	      \
-						   __atg20_old & __atg20_mask,\
-						   __atg20_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem &= mask and return the old value of *mem.  */
 #ifndef atomic_and_val
 # define atomic_and_val(mem, mask) \
@@ -471,22 +345,6 @@ 
   } while (0)
 #endif
 
-#ifndef catomic_or
-# define catomic_or(mem, mask) \
-  do {									      \
-    __typeof (*(mem)) __atg18_old;					      \
-    __typeof (mem) __atg18_memp = (mem);				      \
-    __typeof (*(mem)) __atg18_mask = (mask);				      \
-									      \
-    do									      \
-      __atg18_old = (*__atg18_memp);					      \
-    while (__builtin_expect						      \
-	   (catomic_compare_and_exchange_bool_acq (__atg18_memp,	      \
-						   __atg18_old | __atg18_mask,\
-						   __atg18_old), 0));	      \
-  } while (0)
-#endif
-
 /* Atomically *mem |= mask and return the old value of *mem.  */
 #ifndef atomic_or_val
 # define atomic_or_val(mem, mask) \
diff --git a/malloc/arena.c b/malloc/arena.c
index defd25c8a6850188824c1e51f41845ace11e1060..5fe0984379fad21be06e729e73a7dfba53a4624b 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -953,11 +953,11 @@  arena_get2 (size_t size, mstate avoid_arena)
          enough address space to create that many arenas.  */
       if (__glibc_unlikely (n <= narenas_limit - 1))
         {
-          if (catomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
+          if (atomic_compare_and_exchange_bool_acq (&narenas, n + 1, n))
             goto repeat;
           a = _int_new_arena (size);
 	  if (__glibc_unlikely (a == NULL))
-            catomic_decrement (&narenas);
+            atomic_fetch_add_relaxed (&narenas, -1);
         }
       else
         a = reused_arena (avoid_arena);
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 914052eb694dae3e0323b2a0c8a6538314ba1788..a496fb69e77858e15fa2688910778d961f1ada7b 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2464,11 +2464,11 @@  sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
     }
 
   /* update statistics */
-  int new = atomic_exchange_and_add (&mp_.n_mmaps, 1) + 1;
+  int new = atomic_fetch_add_relaxed (&mp_.n_mmaps, 1) + 1;
   atomic_max (&mp_.max_n_mmaps, new);
 
   unsigned long sum;
-  sum = atomic_exchange_and_add (&mp_.mmapped_mem, size) + size;
+  sum = atomic_fetch_add_relaxed (&mp_.mmapped_mem, size) + size;
   atomic_max (&mp_.max_mmapped_mem, sum);
 
   check_chunk (av, p);
@@ -3037,8 +3037,8 @@  munmap_chunk (mchunkptr p)
       || __glibc_unlikely (!powerof2 (mem & (pagesize - 1))))
     malloc_printerr ("munmap_chunk(): invalid pointer");
 
-  atomic_decrement (&mp_.n_mmaps);
-  atomic_add (&mp_.mmapped_mem, -total_size);
+  atomic_fetch_add_relaxed (&mp_.n_mmaps, -1);
+  atomic_fetch_add_relaxed (&mp_.mmapped_mem, -total_size);
 
   /* If munmap failed the process virtual memory address space is in a
      bad shape.  Just leave the block hanging around, the process will
@@ -3088,7 +3088,7 @@  mremap_chunk (mchunkptr p, size_t new_size)
   set_head (p, (new_size - offset) | IS_MMAPPED);
 
   INTERNAL_SIZE_T new;
-  new = atomic_exchange_and_add (&mp_.mmapped_mem, new_size - size - offset)
+  new = atomic_fetch_add_relaxed (&mp_.mmapped_mem, new_size - size - offset)
         + new_size - size - offset;
   atomic_max (&mp_.max_mmapped_mem, new);
   return p;
@@ -3812,7 +3812,7 @@  _int_malloc (mstate av, size_t bytes)
       if (__glibc_unlikely (pp != NULL && misaligned_chunk (pp)))       \
 	malloc_printerr ("malloc(): unaligned fastbin chunk detected"); \
     }							\
-  while ((pp = catomic_compare_and_exchange_val_acq (fb, pp, victim)) \
+  while ((pp = atomic_compare_and_exchange_val_acq (fb, pp, victim)) \
 	 != victim);					\
 
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
@@ -4530,7 +4530,7 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 	  old2 = old;
 	  p->fd = PROTECT_PTR (&p->fd, old);
 	}
-      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
+      while ((old = atomic_compare_and_exchange_val_rel (fb, p, old2))
 	     != old2);
 
     /* Check that size of fastbin chunk at the top is the same as
diff --git a/malloc/memusage.c b/malloc/memusage.c
index f30906dffb2731c104ea375af48f59c65bcc7c9c..74712834fa8b96fb2d9589d34b34ab07d05a84ca 100644
--- a/malloc/memusage.c
+++ b/malloc/memusage.c
@@ -148,8 +148,8 @@  update_data (struct header *result, size_t len, size_t old_len)
 
   /* Compute current heap usage and compare it with the maximum value.  */
   size_t heap
-    = catomic_exchange_and_add (&current_heap, len - old_len) + len - old_len;
-  catomic_max (&peak_heap, heap);
+    = atomic_fetch_add_relaxed (&current_heap, len - old_len) + len - old_len;
+  atomic_max (&peak_heap, heap);
 
   /* Compute current stack usage and compare it with the maximum
      value.  The base stack pointer might not be set if this is not
@@ -172,15 +172,15 @@  update_data (struct header *result, size_t len, size_t old_len)
     start_sp = sp;
   size_t current_stack = start_sp - sp;
 #endif
-  catomic_max (&peak_stack, current_stack);
+  atomic_max (&peak_stack, current_stack);
 
   /* Add up heap and stack usage and compare it with the maximum value.  */
-  catomic_max (&peak_total, heap + current_stack);
+  atomic_max (&peak_total, heap + current_stack);
 
   /* Store the value only if we are writing to a file.  */
   if (fd != -1)
     {
-      uint32_t idx = catomic_exchange_and_add (&buffer_cnt, 1);
+      uint32_t idx = atomic_fetch_add_relaxed (&buffer_cnt, 1);
       if (idx + 1 >= 2 * buffer_size)
         {
           /* We try to reset the counter to the correct range.  If
@@ -188,7 +188,7 @@  update_data (struct header *result, size_t len, size_t old_len)
              counter it does not matter since that thread will take
              care of the correction.  */
           uint32_t reset = (idx + 1) % (2 * buffer_size);
-          catomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
+          atomic_compare_and_exchange_val_acq (&buffer_cnt, reset, idx + 1);
           if (idx >= 2 * buffer_size)
             idx = reset - 1;
         }
@@ -362,24 +362,24 @@  malloc (size_t len)
     return (*mallocp)(len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_malloc]);
+  atomic_fetch_add_relaxed (&calls[idx_malloc], 1);
   /* Keep track of total memory consumption for `malloc'.  */
-  catomic_add (&total[idx_malloc], len);
+  atomic_fetch_add_relaxed (&total[idx_malloc], len);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, len);
+  atomic_fetch_add_relaxed (&grand_total, len);
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*mallocp)(len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_malloc]);
+      atomic_fetch_add_relaxed (&failed[idx_malloc], 1);
       return NULL;
     }
 
@@ -430,21 +430,21 @@  realloc (void *old, size_t len)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_realloc]);
+  atomic_fetch_add_relaxed (&calls[idx_realloc], 1);
   if (len > old_len)
     {
       /* Keep track of total memory consumption for `realloc'.  */
-      catomic_add (&total[idx_realloc], len - old_len);
+      atomic_fetch_add_relaxed (&total[idx_realloc], len - old_len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len - old_len);
+      atomic_fetch_add_relaxed (&grand_total, len - old_len);
     }
 
   if (len == 0 && old != NULL)
     {
       /* Special case.  */
-      catomic_increment (&realloc_free);
+      atomic_fetch_add_relaxed (&realloc_free, 1);
       /* Keep track of total memory freed using `free'.  */
-      catomic_add (&total[idx_free], real->length);
+      atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
       /* Update the allocation data and write out the records if necessary.  */
       update_data (NULL, 0, old_len);
@@ -457,26 +457,26 @@  realloc (void *old, size_t len)
 
   /* Remember the size of the request.  */
   if (len < 65536)
-    catomic_increment (&histogram[len / 16]);
+    atomic_fetch_add_relaxed (&histogram[len / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
-  catomic_increment (&calls_total);
+  atomic_fetch_add_relaxed (&calls_total, 1);
 
   /* Do the real work.  */
   result = (struct header *) (*reallocp)(real, len + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_realloc]);
+      atomic_fetch_add_relaxed (&failed[idx_realloc], 1);
       return NULL;
     }
 
   /* Record whether the reduction/increase happened in place.  */
   if (real == result)
-    catomic_increment (&inplace);
+    atomic_fetch_add_relaxed (&inplace, 1);
   /* Was the buffer increased?  */
   if (old_len > len)
-    catomic_increment (&decreasing);
+    atomic_fetch_add_relaxed (&decreasing, 1);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (result, len, old_len);
@@ -508,16 +508,16 @@  calloc (size_t n, size_t len)
     return (*callocp)(n, len);
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_calloc]);
+  atomic_fetch_add_relaxed (&calls[idx_calloc], 1);
   /* Keep track of total memory consumption for `calloc'.  */
-  catomic_add (&total[idx_calloc], size);
+  atomic_fetch_add_relaxed (&total[idx_calloc], size);
   /* Keep track of total memory requirement.  */
-  catomic_add (&grand_total, size);
+  atomic_fetch_add_relaxed (&grand_total, size);
   /* Remember the size of the request.  */
   if (size < 65536)
-    catomic_increment (&histogram[size / 16]);
+    atomic_fetch_add_relaxed (&histogram[size / 16], 1);
   else
-    catomic_increment (&large);
+    atomic_fetch_add_relaxed (&large, 1);
   /* Total number of calls of any of the functions.  */
   ++calls_total;
 
@@ -525,7 +525,7 @@  calloc (size_t n, size_t len)
   result = (struct header *) (*mallocp)(size + sizeof (struct header));
   if (result == NULL)
     {
-      catomic_increment (&failed[idx_calloc]);
+      atomic_fetch_add_relaxed (&failed[idx_calloc], 1);
       return NULL;
     }
 
@@ -563,7 +563,7 @@  free (void *ptr)
   /* `free (NULL)' has no effect.  */
   if (ptr == NULL)
     {
-      catomic_increment (&calls[idx_free]);
+      atomic_fetch_add_relaxed (&calls[idx_free], 1);
       return;
     }
 
@@ -577,9 +577,9 @@  free (void *ptr)
     }
 
   /* Keep track of number of calls.  */
-  catomic_increment (&calls[idx_free]);
+  atomic_fetch_add_relaxed (&calls[idx_free], 1);
   /* Keep track of total memory freed using `free'.  */
-  catomic_add (&total[idx_free], real->length);
+  atomic_fetch_add_relaxed (&total[idx_free], real->length);
 
   /* Update the allocation data and write out the records if necessary.  */
   update_data (NULL, 0, real->length);
@@ -614,22 +614,22 @@  mmap (void *start, size_t len, int prot, int flags, int fd, off_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -667,22 +667,22 @@  mmap64 (void *start, size_t len, int prot, int flags, int fd, off64_t offset)
                  ? idx_mmap_a : prot & PROT_WRITE ? idx_mmap_w : idx_mmap_r);
 
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx]);
+      atomic_fetch_add_relaxed (&calls[idx], 1);
       /* Keep track of total memory consumption for `malloc'.  */
-      catomic_add (&total[idx], len);
+      atomic_fetch_add_relaxed (&total[idx], len);
       /* Keep track of total memory requirement.  */
-      catomic_add (&grand_total, len);
+      atomic_fetch_add_relaxed (&grand_total, len);
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx]);
+        atomic_fetch_add_relaxed (&failed[idx], 1);
       else if (idx == idx_mmap_w)
         /* Update the allocation data and write out the records if
            necessary.  Note the first parameter is NULL which means
@@ -722,33 +722,33 @@  mremap (void *start, size_t old_len, size_t len, int flags, ...)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_mremap]);
+      atomic_fetch_add_relaxed (&calls[idx_mremap], 1);
       if (len > old_len)
         {
           /* Keep track of total memory consumption for `malloc'.  */
-          catomic_add (&total[idx_mremap], len - old_len);
+          atomic_fetch_add_relaxed (&total[idx_mremap], len - old_len);
           /* Keep track of total memory requirement.  */
-          catomic_add (&grand_total, len - old_len);
+          atomic_fetch_add_relaxed (&grand_total, len - old_len);
         }
       /* Remember the size of the request.  */
       if (len < 65536)
-        catomic_increment (&histogram[len / 16]);
+        atomic_fetch_add_relaxed (&histogram[len / 16], 1);
       else
-        catomic_increment (&large);
+        atomic_fetch_add_relaxed (&large, 1);
       /* Total number of calls of any of the functions.  */
-      catomic_increment (&calls_total);
+      atomic_fetch_add_relaxed (&calls_total, 1);
 
       /* Check for failures.  */
       if (result == NULL)
-        catomic_increment (&failed[idx_mremap]);
+        atomic_fetch_add_relaxed (&failed[idx_mremap], 1);
       else
         {
           /* Record whether the reduction/increase happened in place.  */
           if (start == result)
-            catomic_increment (&inplace_mremap);
+            atomic_fetch_add_relaxed (&inplace_mremap, 1);
           /* Was the buffer increased?  */
           if (old_len > len)
-            catomic_increment (&decreasing_mremap);
+            atomic_fetch_add_relaxed (&decreasing_mremap, 1);
 
           /* Update the allocation data and write out the records if
              necessary.  Note the first parameter is NULL which means
@@ -783,19 +783,19 @@  munmap (void *start, size_t len)
   if (!not_me && trace_mmap)
     {
       /* Keep track of number of calls.  */
-      catomic_increment (&calls[idx_munmap]);
+      atomic_fetch_add_relaxed (&calls[idx_munmap], 1);
 
       if (__glibc_likely (result == 0))
         {
           /* Keep track of total memory freed using `free'.  */
-          catomic_add (&total[idx_munmap], len);
+          atomic_fetch_add_relaxed (&total[idx_munmap], len);
 
           /* Update the allocation data and write out the records if
              necessary.  */
           update_data (NULL, 0, len);
         }
       else
-        catomic_increment (&failed[idx_munmap]);
+        atomic_fetch_add_relaxed (&failed[idx_munmap], 1);
     }
 
   return result;
diff --git a/manual/memory.texi b/manual/memory.texi
index 23a039c57e60c81787252d935e3b309fd8290902..5cb1dbd281006148f23cfa38c5703fb79089ba78 100644
--- a/manual/memory.texi
+++ b/manual/memory.texi
@@ -354,7 +354,7 @@  this function is in @file{stdlib.h}.
 @c that's protected by list_lock; next_free is only modified while
 @c list_lock is held too.  All other data members of an arena, as well
 @c as the metadata of the memory areas assigned to it, are only modified
-@c while holding the arena's mutex (fastbin pointers use catomic ops
+@c while holding the arena's mutex (fastbin pointers use atomic ops
 @c because they may be modified by free without taking the arena's
 @c lock).  Some reassurance was needed for fastbins, for it wasn't clear
 @c how they were initialized.  It turns out they are always
@@ -383,7 +383,7 @@  this function is in @file{stdlib.h}.
 @c     mutex_lock (arena lock) dup @asulock @aculock [returns locked]
 @c    __get_nprocs ext ok @acsfd
 @c    NARENAS_FROM_NCORES ok
-@c    catomic_compare_and_exchange_bool_acq ok
+@c    atomic_compare_and_exchange_bool_acq ok
 @c    _int_new_arena ok @asulock @aculock @acsmem
 @c     new_heap ok @acsmem
 @c      mmap ok @acsmem
@@ -397,7 +397,7 @@  this function is in @file{stdlib.h}.
 @c     mutex_lock (list_lock) dup @asulock @aculock
 @c     atomic_write_barrier ok
 @c     mutex_unlock (list_lock) @aculock
-@c    catomic_decrement ok
+@c    atomic_decrement ok
 @c    reused_arena @asulock @aculock
 @c      reads&writes next_to_use and iterates over arena next without guards
 @c      those are harmless as long as we don't drop arenas from the
@@ -414,7 +414,7 @@  this function is in @file{stdlib.h}.
 @c   get_max_fast ok
 @c   fastbin_index ok
 @c   fastbin ok
-@c   catomic_compare_and_exhange_val_acq ok
+@c   atomic_compare_and_exhange_val_acq ok
 @c   malloc_printerr dup @mtsenv
 @c     if we get to it, we're toast already, undefined behavior must have
 @c     been invoked before
@@ -521,10 +521,10 @@  this function is in @file{stdlib.h}.
 @c     chunk2mem dup ok
 @c     free_perturb ok
 @c     set_fastchunks ok
-@c      catomic_and ok
+@c      atomic_and ok
 @c     fastbin_index dup ok
 @c     fastbin dup ok
-@c     catomic_compare_and_exchange_val_rel ok
+@c     atomic_compare_and_exchange_val_rel ok
 @c     chunk_is_mmapped ok
 @c     contiguous dup ok
 @c     prev_inuse ok
@@ -706,7 +706,7 @@  The prototype for this function is in @file{stdlib.h}.
 @safety{@prelim{}@mtsafe{}@asunsafe{@asulock{}}@acunsafe{@aculock{} @acsfd{} @acsmem{}}}
 @c __libc_free @asulock @aculock @acsfd @acsmem
 @c   releasing memory into fastbins modifies the arena without taking
-@c   its mutex, but catomic operations ensure safety.  If two (or more)
+@c   its mutex, but atomic operations ensure safety.  If two (or more)
 @c   threads are running malloc and have their own arenas locked when
 @c   each gets a signal whose handler free()s large (non-fastbin-able)
 @c   blocks from each other's arena, we deadlock; this is a more general
diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
index 6d681a7bfdf4f48b4c04a073ebd480326dbd3cc8..4f9d2c1a46b363d346dbc2fa0962ae196844a43a 100644
--- a/misc/tst-atomic.c
+++ b/misc/tst-atomic.c
@@ -393,117 +393,6 @@  do_test (void)
     }
 #endif
 
-#ifdef catomic_compare_and_exchange_val_acq
-  mem = 24;
-  if (catomic_compare_and_exchange_val_acq (&mem, 35, 24) != 24
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (catomic_compare_and_exchange_val_acq (&mem, 10, 15) != 12
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_val_acq (&mem, -56, -15) != -15
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (catomic_compare_and_exchange_val_acq (&mem, 17, 0) != -1
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_val_acq test 4 failed");
-      ret = 1;
-    }
-#endif
-
-  mem = 24;
-  if (catomic_compare_and_exchange_bool_acq (&mem, 35, 24)
-      || mem != 35)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 1 failed");
-      ret = 1;
-    }
-
-  mem = 12;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 10, 15)
-      || mem != 12)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 2 failed");
-      ret = 1;
-    }
-
-  mem = -15;
-  if (catomic_compare_and_exchange_bool_acq (&mem, -56, -15)
-      || mem != -56)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 3 failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  if (! catomic_compare_and_exchange_bool_acq (&mem, 17, 0)
-      || mem != -1)
-    {
-      puts ("catomic_compare_and_exchange_bool_acq test 4 failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_exchange_and_add (&mem, 11) != 2
-      || mem != 13)
-    {
-      puts ("catomic_exchange_and_add test failed");
-      ret = 1;
-    }
-
-  mem = -21;
-  catomic_add (&mem, 22);
-  if (mem != 1)
-    {
-      puts ("catomic_add test failed");
-      ret = 1;
-    }
-
-  mem = -1;
-  catomic_increment (&mem);
-  if (mem != 0)
-    {
-      puts ("catomic_increment test failed");
-      ret = 1;
-    }
-
-  mem = 2;
-  if (catomic_increment_val (&mem) != 3)
-    {
-      puts ("catomic_increment_val test failed");
-      ret = 1;
-    }
-
-  mem = 17;
-  catomic_decrement (&mem);
-  if (mem != 16)
-    {
-      puts ("catomic_decrement test failed");
-      ret = 1;
-    }
-
-  if (catomic_decrement_val (&mem) != 15)
-    {
-      puts ("catomic_decrement_val test failed");
-      ret = 1;
-    }
-
   /* Tests for C11-like atomics.  */
   mem = 11;
   if (atomic_load_relaxed (&mem) != 11 || atomic_load_acquire (&mem) != 11)
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
index 9ed21602d6155d4b960278f8d1fac4ffa885b9d5..40bf5cd3b306315d8eeb6bdba2b2b46b1ea5059e 100644
--- a/sysdeps/hppa/dl-fptr.c
+++ b/sysdeps/hppa/dl-fptr.c
@@ -41,10 +41,8 @@ 
 # error "ELF_MACHINE_LOAD_ADDRESS is not defined."
 #endif
 
-#ifndef COMPARE_AND_SWAP
-# define COMPARE_AND_SWAP(ptr, old, new) \
-  (catomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
-#endif
+#define COMPARE_AND_SWAP(ptr, old, new) \
+  (atomic_compare_and_exchange_bool_acq (ptr, new, old) == 0)
 
 ElfW(Addr) _dl_boot_fptr_table [ELF_MACHINE_BOOT_FPTR_TABLE_LEN];
 
diff --git a/sysdeps/s390/atomic-machine.h b/sysdeps/s390/atomic-machine.h
index e85b2ef50541c7aab6d2981180f6205d2bd681b6..6b1de51c2a30baf5554a729a80a7ce04b56fc22c 100644
--- a/sysdeps/s390/atomic-machine.h
+++ b/sysdeps/s390/atomic-machine.h
@@ -70,8 +70,6 @@ 
     !__atomic_compare_exchange_n (mem, (void *) &__atg2_oldval, newval,	\
 				  1, __ATOMIC_ACQUIRE,			\
 				  __ATOMIC_RELAXED); })
-#define catomic_compare_and_exchange_bool_acq(mem, newval, oldval)	\
-  atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
 
 /* Store NEWVALUE in *MEM and return the old value.  */
 #define atomic_exchange_acq(mem, newvalue)				\
@@ -90,8 +88,6 @@ 
 # define atomic_exchange_and_add_rel(mem, operand)			\
   ({ __atomic_check_size((mem));					\
   __atomic_fetch_add ((mem), (operand), __ATOMIC_RELEASE); })
-#define catomic_exchange_and_add(mem, value)	\
-  atomic_exchange_and_add (mem, value)
 
 /* Atomically *mem |= mask and return the old value of *mem.  */
 /* The gcc builtin uses load-and-or instruction on z196 zarch and higher cpus
@@ -104,8 +100,6 @@ 
   do {						\
     atomic_or_val (mem, mask);			\
   } while (0)
-#define catomic_or(mem, mask)			\
-  atomic_or (mem, mask)
 
 /* Atomically *mem |= 1 << bit and return true if the bit was set in old value
    of *mem.  */
@@ -129,5 +123,3 @@ 
   do {						\
     atomic_and_val (mem, mask);			\
   } while (0)
-#define catomic_and(mem, mask)			\
-  atomic_and(mem, mask)
diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
index 9ae89e0ef12ad28319755ac51260908779b9579f..f4b2cbced828a80335887bf172fd60767cf978ac 100644
--- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
@@ -170,10 +170,6 @@ 
   ({ typeof (*mem) __mask = (typeof (*mem))1 << (bit);    \
      asm_amo ("amoor", ".aq", mem, __mask) & __mask; })
 
-# define catomic_exchange_and_add(mem, value)		\
-  atomic_exchange_and_add (mem, value)
-# define catomic_max(mem, value) atomic_max (mem, value)
-
 #else /* __riscv_atomic */
 # error "ISAs that do not subsume the A extension are not supported"
 #endif /* !__riscv_atomic */
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index f24f1c71ed718c601c71decc1ee0c4b49fdf32f8..ffd059618878be42c05fb21cd51b7434a6f37637 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -20,7 +20,7 @@ 
 #define _X86_ATOMIC_MACHINE_H 1
 
 #include <stdint.h>
-#include <tls.h>			/* For tcbhead_t.  */
+#include <tls.h>			/* For mach.  */
 #include <libc-pointer-arith.h>		/* For cast_to_integer.  */
 
 #define LOCK_PREFIX "lock;"
@@ -52,52 +52,7 @@ 
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
 
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgb %b2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgw %w2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgl %2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
-     ret; })
-
 #ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgq %q2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : "q" ((int64_t) cast_to_integer (newval)),	      \
-			 "m" (*mem),					      \
-			 "0" ((int64_t) cast_to_integer (oldval)),	      \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
 # define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
 # define do_add_val_64_acq(pfx, mem, value) do { } while (0)
 #else
@@ -107,13 +62,6 @@ 
    such an operation.  So don't define any code for now.  If it is
    really going to be used the code below can be used on Intel Pentium
    and later, but NOT on i486.  */
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem);					      \
-     __atomic_link_error ();						      \
-     ret = (newval);							      \
-     ret = (oldval);							      \
-     ret; })
-
 # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret = *(mem);					      \
      __atomic_link_error ();						      \
@@ -181,24 +129,20 @@ 
      if (sizeof (*mem) == 1)						      \
        __asm __volatile (lock "xaddb %b0, %1"				      \
 			 : "=q" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (sizeof (*mem) == 2)					      \
        __asm __volatile (lock "xaddw %w0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (sizeof (*mem) == 4)					      \
        __asm __volatile (lock "xaddl %0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			 : "0" (__addval), "m" (*mem));			      \
      else if (__HAVE_64B_ATOMICS)					      \
        __asm __volatile (lock "xaddq %q0, %1"				      \
 			 : "=r" (__result), "=m" (*mem)			      \
 			 : "0" ((int64_t) cast_to_integer (__addval)),     \
-			   "m" (*mem),					      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+			   "m" (*mem));					      \
      else								      \
        __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
      __result; })
@@ -206,14 +150,6 @@ 
 #define atomic_exchange_and_add(mem, value) \
   __sync_fetch_and_add (mem, value)
 
-#define __arch_exchange_and_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_exchange_and_add(mem, value) \
-  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
-				mem, value)
-
-
 #define __arch_add_body(lock, pfx, apfx, mem, value) \
   do {									      \
     if (__builtin_constant_p (value) && (value) == 1)			      \
@@ -223,24 +159,20 @@ 
     else if (sizeof (*mem) == 1)					      \
       __asm __volatile (lock "addb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (value), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (value), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "addw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (value), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "addl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (value), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "addq %q1, %0"				      \
 			: "=m" (*mem)					      \
 			: "ir" ((int64_t) cast_to_integer (value)),	      \
-			  "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			  "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (apfx, (mem), (value));				      \
   } while (0)
@@ -248,13 +180,6 @@ 
 # define atomic_add(mem, value) \
   __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
 
-#define __arch_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_add(mem, value) \
-  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
-
-
 #define atomic_add_negative(mem, value) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -308,36 +233,25 @@ 
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "incb %b0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "incw %w0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "incl %0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "incq %q0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (pfx, mem, 1);					      \
   } while (0)
 
 #define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_increment_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_increment(mem) \
-  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
-
-
 #define atomic_increment_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -366,36 +280,25 @@ 
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "decb %b0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "decw %w0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "decl %0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "decq %q0"					      \
 			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "m" (*mem));					      \
     else								      \
       do_add_val_64_acq (pfx, mem, -1);					      \
   } while (0)
 
 #define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
 
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
-
-
 #define atomic_decrement_and_test(mem) \
   ({ unsigned char __result;						      \
      if (sizeof (*mem) == 1)						      \
@@ -472,65 +375,49 @@ 
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "andb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "andw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "andl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "andq %q1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else								      \
       __atomic_link_error ();						      \
   } while (0)
 
-#define __arch_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
 #define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
-
-
 #define __arch_or_body(lock, mem, mask) \
   do {									      \
     if (sizeof (*mem) == 1)						      \
       __asm __volatile (lock "orb %b1, %0"				      \
 			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: IBR_CONSTRAINT (mask), "m" (*mem));		      \
     else if (sizeof (*mem) == 2)					      \
       __asm __volatile (lock "orw %w1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (sizeof (*mem) == 4)					      \
       __asm __volatile (lock "orl %1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else if (__HAVE_64B_ATOMICS)					      \
       __asm __volatile (lock "orq %q1, %0"				      \
 			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
+			: "ir" (mask), "m" (*mem));			      \
     else								      \
       __atomic_link_error ();						      \
   } while (0)
 
 #define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
 
-#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
-
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */
 #define atomic_full_barrier() \