[v5,6/7] malloc: add tcache support for large chunk caching

Message ID 20250321120102.54012-7-cupertino.miranda@oracle.com (mailing list archive)
State Under Review
Delegated to: Wilco Dijkstra
Series: tcache: malloc improvements

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Cupertino Miranda March 21, 2025, 12:01 p.m. UTC
The existing tcache implementation in glibc focuses on caching small
allocations, limiting the size of cached chunks to about 1KB.

This patch changes the tcache implementation to allow caching chunks of
any size.  It adds extra bins (linked lists) which store chunks whose
sizes fall into different ranges.  For these extra bins, selection is
done by powers of 2, and chunks are inserted in increasing size order
within each bin.  The last bin holds all remaining allocation sizes.
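
As a rough sketch of the bin-selection rule for the extra bins (illustrative
only; the names range_bin_index, SMALL_LIMIT and N_EXTRA_BINS below are
assumptions made for the example, not identifiers from the patch):

  /* Illustrative sketch, not the patch's code.  Chunk sizes above the
     exact-size bins are grouped by powers of 2: each doubling of the size
     selects the next range bin, and anything past the last range falls
     into the final bin.  */
  #define SMALL_LIMIT   1024UL  /* assumed ~1KB limit of the exact-size bins */
  #define N_EXTRA_BINS  10      /* assumed number of range bins */

  static size_t
  range_bin_index (size_t csize)   /* caller guarantees csize > SMALL_LIMIT */
  {
    size_t idx = __builtin_clzl (SMALL_LIMIT) - __builtin_clzl (csize);
    return idx < N_EXTRA_BINS ? idx : N_EXTRA_BINS - 1;
  }

Within each of these range bins the patch keeps entries sorted by increasing
chunk size (see tcache_location_for_size below), so a lookup walks the list
to the first entry of at least the requested size and only returns it on an
exact size match.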

Although by default this patch preserves the existing behaviour,
limiting cached chunks to 1KB, it now allows the maximum size of cached
chunks to be increased with the tunable glibc.malloc.tcache_max.
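
With the patch applied, values above the old 1KB cap are no longer rejected
by the tunable; for example (an illustrative value), starting a program with
GLIBC_TUNABLES=glibc.malloc.tcache_max=1048576 lets each thread's tcache hold
chunks of up to 1MiB.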
---
 malloc/malloc.c | 134 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 93 insertions(+), 41 deletions(-)
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index c942e3f8d1..cf45ceaebc 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -292,13 +292,19 @@ 
 #if USE_TCACHE
 /* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
 # define TCACHE_MAX_BINS		64
+/* Number of bins beyond TCACHE_MAX_BINS, reserved for ranges of chunk sizes,
+   to allow tcache to cache chunks beyond the ~1k size limit.  */
+# define TCACHE_UNBOUND_SIZE_BINS	10
+# define TCACHE_ALL_BINS	(TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS)
 # define MAX_TCACHE_SIZE	tidx2usize (TCACHE_MAX_BINS-1)
+# define TCACHE_FIXED_SIZE_BINS	\
+	   (mp_.tcache_bins < TCACHE_MAX_BINS ? mp_.tcache_bins : TCACHE_MAX_BINS)
 
 /* Only used to pre-fill the tunables.  */
-# define tidx2usize(idx)	(((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)
+# define tidx2usize(idx)	(((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - MALLOC_ALIGNMENT + 1)
 
-/* When "x" is from chunksize().  */
-# define csize2tidx(x) (((x) - MINSIZE) / MALLOC_ALIGNMENT)
+/* When "x" is from chunksize(). Only usable for single-size tcache bins.  */
+# define fast_csize2tidx(x) (((x) - MINSIZE) / MALLOC_ALIGNMENT)
 /* When "x" is a user-provided size.  */
 # define usize2tidx(x) csize2tidx (request2size (x))
 
@@ -1926,7 +1932,7 @@  static struct malloc_par mp_ =
   ,
   .tcache_count = TCACHE_FILL_COUNT,
   .tcache_bins = TCACHE_MAX_BINS,
-  .tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1),
+  .tcache_max_bytes = MAX_TCACHE_SIZE,
   .tcache_unsorted_limit = 0 /* No limit.  */
 #endif
 };
@@ -3119,8 +3125,8 @@  typedef struct tcache_entry
    time), this is for performance reasons.  */
 typedef struct tcache_perthread_struct
 {
-  uint16_t counts[TCACHE_MAX_BINS];
-  tcache_entry *entries[TCACHE_MAX_BINS];
+  uint16_t counts[TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS];
+  tcache_entry *entries[TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS];
 } tcache_perthread_struct;
 
 static __thread bool tcache_shutting_down = false;
@@ -3153,6 +3159,45 @@  tcache_key_initialize (void)
     }
 }
 
+static __always_inline size_t
+csize2tidx(size_t nb)
+{
+  if (__glibc_likely (nb < tidx2usize (TCACHE_MAX_BINS)))
+    return fast_csize2tidx(nb);
+  else if (__glibc_likely (mp_.tcache_bins < TCACHE_MAX_BINS))
+    return TCACHE_MAX_BINS;
+  else
+    {
+      size_t idx = TCACHE_MAX_BINS
+	      + __builtin_clz (tidx2usize (TCACHE_MAX_BINS-1))
+	      - __builtin_clz (nb);
+      return idx < TCACHE_ALL_BINS ? idx : TCACHE_ALL_BINS - 1;
+    }
+}
+
+/* Returns the position in the tcache bin where the specified size is first
+   found.  This function is used by both tcache_get and tcache_put to locate
+   the proper position in the tcache bin linked list.  */
+static __always_inline tcache_entry **
+tcache_location_for_size (size_t nb, size_t tc_idx)
+{
+  if (__glibc_likely (nb <= tidx2usize (TCACHE_MAX_BINS-1)))
+    return &(tcache->entries[tc_idx]);
+  else
+    {
+      tcache_entry **tep = &(tcache->entries[tc_idx]);
+      tcache_entry *te = REVEAL_PTR (*tep);
+      while (te != NULL
+             && __glibc_unlikely (chunksize (mem2chunk (te)) < nb))
+        {
+          tep = & (te->next);
+          te = REVEAL_PTR (te->next);
+        }
+
+      return tep;
+    }
+}
+
 /* Caller must ensure that we know tc_idx is valid and there's room
    for more chunks.  */
 static __always_inline void
@@ -3164,8 +3209,9 @@  tcache_put (mchunkptr chunk, size_t tc_idx)
      detect a double free.  */
   e->key = tcache_key;
 
-  e->next = PROTECT_PTR (&e->next, REVEAL_PTR (tcache->entries[tc_idx]));
-  tcache->entries[tc_idx] = PROTECT_PTR (&(tcache->entries[tc_idx]), e);
+  tcache_entry **entry = tcache_location_for_size (chunksize (chunk), tc_idx);
+  e->next = PROTECT_PTR (&e->next, REVEAL_PTR (*entry));
+  *entry = PROTECT_PTR (entry, e);
   ++(tcache->counts[tc_idx]);
 }
 
@@ -3173,7 +3219,7 @@  tcache_put (mchunkptr chunk, size_t tc_idx)
    available chunks to remove.  Removes chunk from the middle of the
    list.  */
 static __always_inline void *
-tcache_get_n (size_t tc_idx, tcache_entry **ep)
+tcache_remove_entry (size_t tc_idx, tcache_entry **ep)
 {
   tcache_entry *e = REVEAL_PTR (*ep);
 
@@ -3187,18 +3233,18 @@  tcache_get_n (size_t tc_idx, tcache_entry **ep)
   return (void *) e;
 }
 
-/* Like the above, but removes from the head of the list.  */
+/* Like the above, but traverses the list to find the proper location of the
+   entry within the respective tcache bin.  */
 static __always_inline void *
-tcache_get (size_t tc_idx)
+tcache_get (size_t nb, size_t tc_idx)
 {
-  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
-}
+  tcache_entry **entry = tcache_location_for_size (nb, tc_idx);
+  tcache_entry *e = REVEAL_PTR (*entry);
+  if (__glibc_unlikely (tc_idx >= TCACHE_MAX_BINS)
+      && (e == NULL || chunksize (mem2chunk (e)) != nb))
+    return NULL;
 
-/* Iterates through the tcache linked list.  */
-static __always_inline tcache_entry *
-tcache_next (tcache_entry *e)
-{
-  return (tcache_entry *) REVEAL_PTR (e->next);
+  return tcache_remove_entry (tc_idx, entry);
 }
 
 /* Check if tcache is available for alloc by corresponding tc_idx.  */
@@ -3281,7 +3327,7 @@  tcache_thread_shutdown (void)
 
   /* Free all of the entries and the tcache itself back to the arena
      heap for coalescing.  */
-  for (i = 0; i < TCACHE_MAX_BINS; ++i)
+  for (i = 0; i < TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS; ++i)
     {
       while (REVEAL_PTR (tcache_tmp->entries[i]))
 	{
@@ -3330,7 +3376,7 @@  tcache_init(void)
       memset (tcache, 0, sizeof (tcache_perthread_struct));
     }
 
-  for (int i = 0; i < mp_.tcache_bins; i++)
+  for (int i = 0; i < TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS; i++)
     tcache->entries[i] = PROTECT_PTR (&(tcache->entries[i]), NULL);
 }
 
@@ -3357,7 +3403,7 @@  tcache_try_malloc (size_t bytes, void **memptr)
   MAYBE_INIT_TCACHE ();
 
   if (tcache_available (tc_idx))
-    *memptr = tcache_get (tc_idx);
+    *memptr = tcache_get (tbytes, tc_idx);
   else
     *memptr = NULL;
 
@@ -3692,20 +3738,28 @@  _mid_memalign (size_t alignment, size_t bytes, void *address)
 
     if (tcache_available (tc_idx))
       {
-	/* The tcache itself isn't encoded, but the chain is.  */
-	tcache_entry **tep = & tcache->entries[tc_idx];
+	tcache_entry **tep = tcache_location_for_size (tbytes, tc_idx);
 	tcache_entry *te = REVEAL_PTR (*tep);
+
+	/* Make sure returned chunk is of expected size.  */
+	if (te == NULL
+	    || (tc_idx >= TCACHE_MAX_BINS
+		&& chunksize (mem2chunk (te)) != tbytes))
+	  goto tcache_memalign_abort;
+
+	/* The tcache itself isn't encoded, but the chain is.  */
 	while (te != NULL && !PTR_IS_ALIGNED (te, alignment))
 	  {
 	    tep = & (te->next);
-	    te = tcache_next (te);
+	    te = REVEAL_PTR (te->next);
 	  }
 	if (te != NULL)
 	  {
-	    void *victim = tcache_get_n (tc_idx, tep);
+	    void *victim = tcache_remove_entry (tc_idx, tep);
 	    return tag_new_usable (victim);
 	  }
       }
+tcache_memalign_abort:
   }
 #endif
 
@@ -3992,8 +4046,8 @@  _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	      /* While we're here, if we see other chunks of the same size,
 		 stash them in the tcache.  */
-	      size_t tc_idx = csize2tidx (nb);
-	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
+	      size_t tc_idx = fast_csize2tidx (nb);
+	      if (tcache != NULL && tc_idx < TCACHE_FIXED_SIZE_BINS)
 		{
 		  mchunkptr tc_victim;
 
@@ -4053,8 +4107,8 @@  _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	  /* While we're here, if we see other chunks of the same size,
 	     stash them in the tcache.  */
-	  size_t tc_idx = csize2tidx (nb);
-	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
+	  size_t tc_idx = fast_csize2tidx (nb);
+	  if (__glibc_likely (tcache != NULL) && tc_idx < TCACHE_FIXED_SIZE_BINS)
 	    {
 	      mchunkptr tc_victim;
 
@@ -4115,8 +4169,8 @@  _int_malloc (mstate av, size_t bytes)
 
 #if USE_TCACHE
   INTERNAL_SIZE_T tcache_nb = 0;
-  size_t tc_idx = csize2tidx (nb);
-  if (tcache != NULL && tc_idx < mp_.tcache_bins)
+  size_t tc_idx = fast_csize2tidx (nb);
+  if (tcache != NULL && tc_idx < TCACHE_FIXED_SIZE_BINS)
     tcache_nb = nb;
   int return_cached = 0;
 
@@ -4294,7 +4348,7 @@  _int_malloc (mstate av, size_t bytes)
 	  && mp_.tcache_unsorted_limit > 0
 	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
 	{
-	  return tcache_get (tc_idx);
+	  return tcache_get (nb, tc_idx);
 	}
 #endif
 
@@ -4307,7 +4361,7 @@  _int_malloc (mstate av, size_t bytes)
       /* If all the small chunks we found ended up cached, return one now.  */
       if (return_cached)
 	{
-	  return tcache_get (tc_idx);
+	  return tcache_get (nb, tc_idx);
 	}
 #endif
 
@@ -5553,14 +5607,12 @@  do_set_arena_max (size_t value)
 static __always_inline int
 do_set_tcache_max (size_t value)
 {
-  if (value <= MAX_TCACHE_SIZE)
-    {
-      LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
-      mp_.tcache_max_bytes = value;
-      mp_.tcache_bins = csize2tidx (request2size(value)) + 1;
-      return 1;
-    }
-  return 0;
+  LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+  mp_.tcache_max_bytes = value;
+  mp_.tcache_bins = TCACHE_MAX_BINS + TCACHE_UNBOUND_SIZE_BINS;
+  mp_.tcache_bins = csize2tidx (request2size(value)) + 1;
+
+  return 1;
 }
 
 static __always_inline int