[6/6] malloc: Use tailcalls in __libc_free
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
Use tailcalls to avoid the overhead of a frame on the free fastpath.
Move tcache initialization to _int_free_chunk(). Add malloc_printerr_tail()
which can be tailcalled without forcing a frame like no-return functions.
Change tcache_double_free_verify() to retry via __libc_free() after clearing
the key.
Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
and 27.1% for 32 threads on Neoverse V2. Bench-malloc-simple improves by
5.9% on average (13% for single-threaded case).
Passes regress, OK for commit?
---
Comments
* Wilco Dijkstra:
> Use tailcalls to avoid the overhead of a frame on the free fastpath.
> Move tache initialization to _int_free_chunk(). Add malloc_printerr_tail()
typo: t[c]ache
> which can be tailcalled without forcing a frame like no-return functions.
> Change tcache_double_free_verify() to retry via __libc_free() after clearing
> the key.
>
> Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
> and 27.1% for 32 threads on Neoverse V2. Bench-malloc-simple improves by
> 5.9% on average (13% for single-threaded case).
>
> Passes regress, OK for commit?
Rest looks okay to me (not sure if this qualifies as a review, sorry).
Thanks,
Florian
On 31/03/25 13:41, Florian Weimer wrote:
> * Wilco Dijkstra:
>
>> Use tailcalls to avoid the overhead of a frame on the free fastpath.
>> Move tache initialization to _int_free_chunk(). Add malloc_printerr_tail()
>
> typo: t[c]ache
>
>> which can be tailcalled without forcing a frame like no-return functions.
>> Change tcache_double_free_verify() to retry via __libc_free() after clearing
>> the key.
>>
>> Overall performance of bench-malloc-thread improves by 29.9% for 1 thread
>> and 27.1% for 32 threads on Neoverse V2. Bench-malloc-simple improves by
>> 5.9% on average (13% for single-threaded case).
>>
>> Passes regress, OK for commit?
>
> Rest looks okay to me (not sure if this qualifies as a review, sorry).
LGTM as well.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
@@ -1099,6 +1099,9 @@ static void* _int_memalign(mstate, size_t, size_t);
static void* _mid_memalign(size_t, size_t, void *);
#endif
+#if USE_TCACHE
+static void malloc_printerr_tail(const char *str);
+#endif
static void malloc_printerr(const char *str) __attribute__ ((noreturn));
static void munmap_chunk(mchunkptr p);
@@ -3238,9 +3241,12 @@ tcache_double_free_verify (tcache_entry *e, size_t tc_idx)
malloc_printerr ("free(): unaligned chunk detected in tcache 2");
if (tmp == e)
malloc_printerr ("free(): double free detected in tcache 2");
- /* If we get here, it was a coincidence. We've wasted a
- few cycles, but don't abort. */
}
+ /* No double free detected - it might be in a tcache of another thread,
+ or user data that happens to match the key. Since we are not sure,
+ clear the key and retry freeing it. */
+ e->key = 0;
+ __libc_free (e);
}
static void
@@ -3430,15 +3436,13 @@ __libc_free (void *mem)
mchunkptr p = mem2chunk (mem);
- MAYBE_INIT_TCACHE ();
-
/* Mark the chunk as belonging to the library again. */
(void)tag_region (chunk2mem (p), memsize (p));
INTERNAL_SIZE_T size = chunksize (p);
if (__glibc_unlikely (misaligned_chunk (p)))
- malloc_printerr ("free(): invalid pointer");
+ return malloc_printerr_tail ("free(): invalid pointer");
check_inuse_chunk (arena_for_chunk (p), p);
@@ -3452,7 +3456,7 @@ __libc_free (void *mem)
/* Check for double free - verify if the key matches. */
if (__glibc_unlikely (e->key == tcache_key))
- tcache_double_free_verify (e, tc_idx);
+ return tcache_double_free_verify (e, tc_idx);
if (__glibc_likely (tcache->counts[tc_idx] < mp_.tcache_count))
return tcache_put (p, tc_idx);
@@ -3462,7 +3466,7 @@ __libc_free (void *mem)
/* Check size >= MINSIZE and p + size does not overflow. */
if (__glibc_unlikely (__builtin_add_overflow_p ((uintptr_t) p, size - MINSIZE,
(uintptr_t) 0)))
- malloc_printerr ("free(): invalid size");
+ return malloc_printerr_tail ("free(): invalid size");
_int_free_chunk (arena_for_chunk (p), p, size, 0);
}
@@ -4549,6 +4553,8 @@ _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
{
mfastbinptr *fb; /* associated fastbin */
+ MAYBE_INIT_TCACHE ();
+
/*
If eligible, place chunk on a fastbin so it can be found
and used quickly in malloc.
@@ -5810,6 +5816,18 @@ malloc_printerr (const char *str)
__builtin_unreachable ();
}
+#if USE_TCACHE
+static __attribute_noinline__ void
+malloc_printerr_tail (const char *str)
+{
+ /* Ensure this cannot be a no-return function. */
+ if (!__malloc_initialized)
+ return;
+ malloc_printerr (str);
+}
+#endif
+
+
#if IS_IN (libc)
/* We need a wrapper function for one of the additions of POSIX. */
int