[v3,5/5] malloc: Add tcache path for calloc

Message ID 20240829062732.1663342-6-wangyang.guo@intel.com
State Under Review
Delegated to: Florian Weimer
Headers
Series malloc: TCACHE improvement for free and calloc |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

Guo, Wangyang Aug. 29, 2024, 6:27 a.m. UTC
  This commit add tcache support in calloc() which can largely improve
the performance of small size allocation, especially in multi-thread
scenario. clear_mem() and tcache_available() is split out as a helper
function for better reusing the code.

Also fix tst-safe-linking failure after enabling tcache. In previous,
calloc() is used as a way to by-pass tcache in memory allocation and
trigger safe-linking check in fastbins path. With tcache enabled, it
needs extra workarounds to bypass tcache.

Result of bench-malloc-thread benchmark

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.724
4 threads  | 0.534

---
Changes in v3:
- Split out tcache_available() as helper function.
- Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
Changes in v2:
- Merge tst-safe-linking fix to make sure CI check pass.
- Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
---
 malloc/malloc.c           | 129 ++++++++++++++++++++++++--------------
 malloc/tst-safe-linking.c |  81 ++++++++++++++++++++----
 2 files changed, 150 insertions(+), 60 deletions(-)
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index efb5292e9f..f9dd29cf64 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3211,6 +3211,18 @@  tcache_next (tcache_entry *e)
   return (tcache_entry *) REVEAL_PTR (e->next);
 }
 
+/* Check if tcache is available for alloc by corresponding tc_idx.  */
+static __always_inline bool
+tcache_availabe (size_t tc_idx)
+{
+  if (tc_idx < mp_.tcache_bins
+      && tcache != NULL
+      && tcache->counts[tc_idx] > 0)
+    return true;
+  else
+    return false;
+}
+
 /* Verify if the suspicious tcache_entry is double free.
    It's not expected to execute very often, mark it as noinline.  */
 static __attribute__ ((noinline)) void
@@ -3369,9 +3381,7 @@  __libc_malloc (size_t bytes)
   MAYBE_INIT_TCACHE ();
 
   DIAG_PUSH_NEEDS_COMMENT;
-  if (tc_idx < mp_.tcache_bins
-      && tcache != NULL
-      && tcache->counts[tc_idx] > 0)
+  if (tcache_availabe (tc_idx))
     {
       victim = tcache_get (tc_idx);
       return tag_new_usable (victim);
@@ -3683,9 +3693,7 @@  _mid_memalign (size_t alignment, size_t bytes, void *address)
       }
     size_t tc_idx = csize2tidx (tbytes);
 
-    if (tc_idx < mp_.tcache_bins
-	&& tcache != NULL
-	&& tcache->counts[tc_idx] > 0)
+    if (tcache_availabe (tc_idx))
       {
 	/* The tcache itself isn't encoded, but the chain is.  */
 	tcache_entry **tep = & tcache->entries[tc_idx];
@@ -3763,16 +3771,55 @@  __libc_pvalloc (size_t bytes)
   return _mid_memalign (pagesize, rounded_bytes, address);
 }
 
+static __always_inline void *
+clear_mem (void *mem, INTERNAL_SIZE_T csz)
+{
+  INTERNAL_SIZE_T *d;
+  unsigned long clearsize, nclears;
+
+  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
+     contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3.  */
+  d = (INTERNAL_SIZE_T *) mem;
+  clearsize = csz - SIZE_SZ;
+  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+  assert (nclears >= 3);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  else
+    {
+      *(d + 0) = 0;
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      if (nclears > 4)
+        {
+          *(d + 3) = 0;
+          *(d + 4) = 0;
+          if (nclears > 6)
+            {
+              *(d + 5) = 0;
+              *(d + 6) = 0;
+              if (nclears > 8)
+                {
+                  *(d + 7) = 0;
+                  *(d + 8) = 0;
+                }
+            }
+        }
+    }
+
+  return mem;
+}
+
 void *
 __libc_calloc (size_t n, size_t elem_size)
 {
   mstate av;
-  mchunkptr oldtop;
-  INTERNAL_SIZE_T sz, oldtopsize;
+  mchunkptr oldtop, p;
+  INTERNAL_SIZE_T sz, oldtopsize, csz;
   void *mem;
-  unsigned long clearsize;
-  unsigned long nclears;
-  INTERNAL_SIZE_T *d;
   ptrdiff_t bytes;
 
   if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3788,6 +3835,27 @@  __libc_calloc (size_t n, size_t elem_size)
 
   MAYBE_INIT_TCACHE ();
 
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = checked_request2size (bytes);
+  if (tbytes == 0)
+    {
+      __set_errno (ENOMEM);
+      return NULL;
+    }
+  size_t tc_idx = csize2tidx (tbytes);
+
+  if (tcache_availabe (tc_idx))
+    {
+      mem = tcache_get (tc_idx);
+      p = mem2chunk (mem);
+      if (__glibc_unlikely (mtag_enabled))
+	return tag_new_zero_region (mem, memsize (p));
+      csz = chunksize (p);
+      return clear_mem (mem, csz);
+    }
+#endif
+
   if (SINGLE_THREAD_P)
     av = &main_arena;
   else
@@ -3842,7 +3910,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (mem == 0)
     return 0;
 
-  mchunkptr p = mem2chunk (mem);
+  p = mem2chunk (mem);
 
   /* If we are using memory tagging, then we need to set the tags
      regardless of MORECORE_CLEARS, so we zero the whole block while
@@ -3850,7 +3918,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (__glibc_unlikely (mtag_enabled))
     return tag_new_zero_region (mem, memsize (p));
 
-  INTERNAL_SIZE_T csz = chunksize (p);
+  csz = chunksize (p);
 
   /* Two optional cases in which clearing not necessary */
   if (chunk_is_mmapped (p))
@@ -3869,40 +3937,7 @@  __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
-  d = (INTERNAL_SIZE_T *) mem;
-  clearsize = csz - SIZE_SZ;
-  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
-  assert (nclears >= 3);
-
-  if (nclears > 9)
-    return memset (d, 0, clearsize);
-
-  else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
-
-  return mem;
+  return clear_mem (mem, csz);
 }
 #endif /* IS_IN (libc) */
 
diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
index 01dd07004d..5302575ad1 100644
--- a/malloc/tst-safe-linking.c
+++ b/malloc/tst-safe-linking.c
@@ -111,22 +111,37 @@  test_fastbin (void *closure)
   int i;
   int mask = ((int *)closure)[0];
   size_t size = TCACHE_ALLOC_SIZE;
+  void * ps[TCACHE_FILL_COUNT];
+  void * pps[TCACHE_FILL_COUNT];
 
   printf ("++ fastbin ++\n");
 
+  /* Populate the fastbin list.  */
+  void * volatile a = calloc (1, size);
+  void * volatile b = calloc (1, size);
+  void * volatile c = calloc (1, size);
+  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
+  /* Chunks for later tcache filling from fastbins.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      pps[i] = p;
+    }
+
   /* Take the tcache out of the game.  */
   for (i = 0; i < TCACHE_FILL_COUNT; ++i)
     {
       void * volatile p = calloc (1, size);
-      printf ("p=%p\n", p);
-      free (p);
+      ps[i] = p;
     }
 
-  /* Populate the fastbin list.  */
-  void * volatile a = calloc (1, size);
-  void * volatile b = calloc (1, size);
-  void * volatile c = calloc (1, size);
-  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+    }
+
+  /* Free abc will return to fastbin in FIFO order.  */
   free (a);
   free (b);
   free (c);
@@ -136,11 +151,43 @@  test_fastbin (void *closure)
   memset (c, mask & 0xFF, size);
   printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
 
+  /* Filling fastbins, will be copied to tcache later.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (pps[i]);
+    }
+
+  /* Drain out tcache to make sure later alloc from fastbins.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      ps[i] = p;
+    }
+
+  /* This line will also filling tcache with remain pps and c.  */
+  pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
+
+  /* Tcache is FILO, now the first one is c, take it out.  */
   c = calloc (1, size);
   printf ("Allocated: c=%p\n", c);
+
+  /* Drain out remain pps from tcache.  */
+  for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      pps[i] = p;
+    }
+
   /* This line will trigger the Safe-Linking check.  */
   b = calloc (1, size);
   printf ("b=%p\n", b);
+
+  /* Free previous pointers. */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+      free (pps[i]);
+    }
 }
 
 /* Try corrupting the fastbin list and trigger a consolidate.  */
@@ -150,21 +197,29 @@  test_fastbin_consolidate (void *closure)
   int i;
   int mask = ((int*)closure)[0];
   size_t size = TCACHE_ALLOC_SIZE;
+  void * ps[TCACHE_FILL_COUNT];
 
   printf ("++ fastbin consolidate ++\n");
 
+  /* Populate the fastbin list.  */
+  void * volatile a = calloc (1, size);
+  void * volatile b = calloc (1, size);
+  void * volatile c = calloc (1, size);
+  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
   /* Take the tcache out of the game.  */
   for (i = 0; i < TCACHE_FILL_COUNT; ++i)
     {
       void * volatile p = calloc (1, size);
-      free (p);
+      ps[i] = p;
     }
 
-  /* Populate the fastbin list.  */
-  void * volatile a = calloc (1, size);
-  void * volatile b = calloc (1, size);
-  void * volatile c = calloc (1, size);
-  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+    }
+
+  /* Free abc will return to fastbin.  */
   free (a);
   free (b);
   free (c);