[6/6] malloc: Use tailcalls in __libc_free

Message ID PAWPR08MB8982B6D887748D7E7CD7735A83AD2@PAWPR08MB8982.eurprd08.prod.outlook.com (mailing list archive)
State New
Headers
Series [1/6] malloc: Move mmap code out of __libc_free hotpath |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

Wilco Dijkstra March 31, 2025, 2:37 p.m. UTC
  Use tailcalls to avoid the overhead of a frame on the free fastpath.
Move tache initialization to _int_free_chunk().  Add malloc_printerr_tail()
which can be tailcalled without forcing a frame like no-return functions.
Change tcache_double_free_verify() to retry via __libc_free() after clearing
the key.

Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
and 27.1% for 32 threads on Neoverse V2.  Bench-malloc-simple improves by
5.9% on average (13% for single-threaded case).

Passes regress, OK for commit?

---
  

Comments

Florian Weimer March 31, 2025, 4:41 p.m. UTC | #1
* Wilco Dijkstra:

> Use tailcalls to avoid the overhead of a frame on the free fastpath.
> Move tache initialization to _int_free_chunk().  Add malloc_printerr_tail()

typo: t[c]ache

> which can be tailcalled without forcing a frame like no-return functions.
> Change tcache_double_free_verify() to retry via __libc_free() after clearing
> the key.
>
> Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
> and 27.1% for 32 threads on Neoverse V2.  Bench-malloc-simple improves by
> 5.9% on average (13% for single-threaded case).
>
> Passes regress, OK for commit?

Rest looks okay to me (not sure if this qualifies as a review, sorry).

Thanks,
Florian
  
Adhemerval Zanella Netto April 14, 2025, 2:21 p.m. UTC | #2
On 31/03/25 13:41, Florian Weimer wrote:
> * Wilco Dijkstra:
> 
>> Use tailcalls to avoid the overhead of a frame on the free fastpath.
>> Move tache initialization to _int_free_chunk().  Add malloc_printerr_tail()
> 
> typo: t[c]ache
> 
>> which can be tailcalled without forcing a frame like no-return functions.
>> Change tcache_double_free_verify() to retry via __libc_free() after clearing
>> the key.
>>
>> Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
>> and 27.1% for 32 threads on Neoverse V2.  Bench-malloc-simple improves by
>> 5.9% on average (13% for single-threaded case).
>>
>> Passes regress, OK for commit?
> 
> Rest looks okay to me (not sure if this qualifies as a review, sorry).

LGTM as well.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 46383c4909c8c4bb16c02b00573c23d149ecd69d..2fec3578275c444e95b9ee25daddd981228c88c8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1099,6 +1099,9 @@  static void*  _int_memalign(mstate, size_t, size_t);
 static void*  _mid_memalign(size_t, size_t, void *);
 #endif
 
+#if USE_TCACHE
+static void malloc_printerr_tail(const char *str);
+#endif
 static void malloc_printerr(const char *str) __attribute__ ((noreturn));
 
 static void munmap_chunk(mchunkptr p);
@@ -3238,9 +3241,12 @@  tcache_double_free_verify (tcache_entry *e, size_t tc_idx)
 	malloc_printerr ("free(): unaligned chunk detected in tcache 2");
       if (tmp == e)
 	malloc_printerr ("free(): double free detected in tcache 2");
-      /* If we get here, it was a coincidence.  We've wasted a
-	 few cycles, but don't abort.  */
     }
+  /* No double free detected - it might be in a tcache of another thread,
+     or user data that happens to match the key.  Since we are not sure,
+     clear the key and retry freeing it.  */
+  e->key = 0;
+  __libc_free (e);
 }
 
 static void
@@ -3430,15 +3436,13 @@  __libc_free (void *mem)
 
   mchunkptr p = mem2chunk (mem);
 
-  MAYBE_INIT_TCACHE ();
-
   /* Mark the chunk as belonging to the library again.  */
   (void)tag_region (chunk2mem (p), memsize (p));
 
   INTERNAL_SIZE_T size = chunksize (p);
 
   if (__glibc_unlikely (misaligned_chunk (p)))
-    malloc_printerr ("free(): invalid pointer");
+    return malloc_printerr_tail ("free(): invalid pointer");
 
   check_inuse_chunk (arena_for_chunk (p), p);
 
@@ -3452,7 +3456,7 @@  __libc_free (void *mem)
 
       /* Check for double free - verify if the key matches.  */
       if (__glibc_unlikely (e->key == tcache_key))
-        tcache_double_free_verify (e, tc_idx);
+        return tcache_double_free_verify (e, tc_idx);
 
       if (__glibc_likely (tcache->counts[tc_idx] < mp_.tcache_count))
         return tcache_put (p, tc_idx);
@@ -3462,7 +3466,7 @@  __libc_free (void *mem)
   /* Check size >= MINSIZE and p + size does not overflow.  */
   if (__glibc_unlikely (__builtin_add_overflow_p ((uintptr_t) p, size - MINSIZE,
 						  (uintptr_t) 0)))
-    malloc_printerr ("free(): invalid size");
+    return malloc_printerr_tail ("free(): invalid size");
 
   _int_free_chunk (arena_for_chunk (p), p, size, 0);
 }
@@ -4549,6 +4553,8 @@  _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
 {
   mfastbinptr *fb;             /* associated fastbin */
 
+  MAYBE_INIT_TCACHE ();
+
   /*
     If eligible, place chunk on a fastbin so it can be found
     and used quickly in malloc.
@@ -5810,6 +5816,18 @@  malloc_printerr (const char *str)
   __builtin_unreachable ();
 }
 
+#if USE_TCACHE
+static __attribute_noinline__ void
+malloc_printerr_tail (const char *str)
+{
+  /* Ensure this cannot be a no-return function.  */
+  if (!__malloc_initialized)
+    return;
+  malloc_printerr (str);
+}
+#endif
+
+
 #if IS_IN (libc)
 /* We need a wrapper function for one of the additions of POSIX.  */
 int