malloc: Count tcache entries downwards
Checks
| Context |
Check |
Description |
| redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
| linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
| linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
| linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
| linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
| redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Currently tcache requires 2 global variable accesses to determine
whether a block can be added to the tcache. Change the counts array
to indicate the number of entries that could be added by counting
downwards. If the count reaches zero, no more blocks can be added.
If the entries pointer is not NULL, at least one block is available
for allocation.
Now each tcache bin can support a different maximum number of blocks,
and they can be individually switched on or off (a zero initialized
count+entry means the tcache bin is not available for free or malloc).
Performance is neutral on current trunk (+1.4% on malloc-bench-thread 32),
but after the outstanding improvements to free it gives ~5%.
---
Comments
Hi Wilco,
On 10-04-2025 14:43, Wilco Dijkstra wrote:
>
> Currently tcache requires 2 global variable accesses to determine
> whether a block can be added to the tcache. Change the counts array
> to indicate the number of entries that could be added by counting
> downwards. If the count reaches zero, no more blocks can be added.
> If the entries pointer is not NULL, at least one block is available
> for allocation.
>
> Now each tcache bin can support a different maximum number of blocks,
> and they can be individually switched on or off (a zero initialized
> count+entry means the tcache bin is not available for free or malloc).
>
> Performance is neutral on current trunk (+1.4% on malloc-bench-thread 32),
> but after the outstanding improvements to free it gives ~5%.
>
> ---
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index a0bc733482532ce34684d0357cb9076b03ac8a52..c3e46518c5335467bb9b6ba0f0e96c4d18ee25e6 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3169,7 +3169,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>
> e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
> tcache->entries[tc_idx] = e;
> - ++(tcache->counts[tc_idx]);
> + --(tcache->counts[tc_idx]);
> }
>
> /* Caller must ensure that we know tc_idx is valid and there's
> @@ -3192,7 +3192,7 @@ tcache_get_n (size_t tc_idx, tcache_entry **ep)
> else
> *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
>
> - --(tcache->counts[tc_idx]);
> + ++(tcache->counts[tc_idx]);
> e->key = 0;
> return (void *) e;
> }
> @@ -3217,7 +3217,7 @@ tcache_available (size_t tc_idx)
> {
> if (tc_idx < mp_.tcache_bins
> && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + && tcache->entries[tc_idx] != NULL)
This will not be OK for the pointer swizzling, which swizzles the entries.
Why not check for:
tcache->counts[tc_idx] != 0
> return true;
> else
> return false;
> @@ -3265,7 +3265,7 @@ tcache_free (mchunkptr p, INTERNAL_SIZE_T size)
> if (__glibc_unlikely (e->key == tcache_key))
> tcache_double_free_verify (e, tc_idx);
>
> - if (tcache->counts[tc_idx] < mp_.tcache_count)
> + if (tcache->counts[tc_idx] != 0)
You do it here.
> {
> tcache_put (p, tc_idx);
> done = true;
> @@ -3337,6 +3337,8 @@ tcache_init(void)
> {
> tcache = (tcache_perthread_struct *) victim;
> memset (tcache, 0, sizeof (tcache_perthread_struct));
> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
> + tcache->counts[i] = mp_.tcache_count;
> }
>
> }
> @@ -4006,8 +4008,7 @@ _int_malloc (mstate av, size_t bytes)
> mchunkptr tc_victim;
>
> /* While bin not empty and tcache not full, copy chunks. */
> - while (tcache->counts[tc_idx] < mp_.tcache_count
> - && (tc_victim = *fb) != NULL)
> + while (tcache->counts[tc_idx] != 0 && (tc_victim = *fb) != NULL)
> {
> if (__glibc_unlikely (misaligned_chunk (tc_victim)))
> malloc_printerr ("malloc(): unaligned fastbin chunk detected 3");
> @@ -4067,8 +4068,7 @@ _int_malloc (mstate av, size_t bytes)
> mchunkptr tc_victim;
>
> /* While bin not empty and tcache not full, copy chunks over. */
> - while (tcache->counts[tc_idx] < mp_.tcache_count
> - && (tc_victim = last (bin)) != bin)
> + while (tcache->counts[tc_idx] != 0 && (tc_victim = last (bin)) != bin)
> {
> if (tc_victim != NULL)
> {
> @@ -4204,8 +4204,7 @@ _int_malloc (mstate av, size_t bytes)
> #if USE_TCACHE
> /* Fill cache first, return to user only if cache fills.
> We may return one of these chunks later. */
> - if (tcache_nb > 0
> - && tcache->counts[tc_idx] < mp_.tcache_count)
> + if (tcache_nb > 0 && tcache->counts[tc_idx] != 0)
> {
> tcache_put (victim, tc_idx);
> return_cached = 1;
>
>
Hi Cupertino,
> - && tcache->counts[tc_idx] > 0)
> + && tcache->entries[tc_idx] != NULL)
> This will not be Ok for the pointer sizzling, which sizzles the entries.
> Why not check for:
> tcache->counts[tc_idx] != 0
That checks for tcache not full rather than not empty, so it won't work.
Pointer swizzling would need an extra swizzle here.
>> - if (tcache->counts[tc_idx] < mp_.tcache_count)
>> + if (tcache->counts[tc_idx] != 0)
> You do it here.
This checks whether tcache is not full.
There are alternative schemes possible, eg. using combined
dual counters that can give a zero for both full and empty,
but those are more complex than this.
Cheers,
Wilco
On 4/11/25 6:52 AM, Wilco Dijkstra wrote:
> Hi Cupertino,
>
>> - && tcache->counts[tc_idx] > 0)
>> + && tcache->entries[tc_idx] != NULL)
>> This will not be Ok for the pointer sizzling, which sizzles the entries.
>> Why not check for:
>> tcache->counts[tc_idx] != 0
>
> That checks for tcache not full rather than not empty, so it won't work.
> Pointer swizzling would need an extra swizzle here.
>
>>> - if (tcache->counts[tc_idx] < mp_.tcache_count)
>>> + if (tcache->counts[tc_idx] != 0)
>> You do it here.
>
> This checks whether tcache is not full.
>
> There are alternative schemes possible, eg. using combined
> dual counters that can give a zero for both full and empty,
> but those are more complex than this.
... and less complex is better.
The implementation should be simple enough to debug, and obvious to
understand when reading the code.
It may not always be easy to keep all the parts straight in your head,
but it should be possible to write top-level documents like:
https://sourceware.org/glibc/wiki/MallocInternals
And keep them updated so anyone using or developing glibc can
understand the top-level heuristics.
Hi Wilco,
On 11-04-2025 11:52, Wilco Dijkstra wrote:
> Hi Cupertino,
>
>> - && tcache->counts[tc_idx] > 0)
>> + && tcache->entries[tc_idx] != NULL)
>> This will not be Ok for the pointer sizzling, which sizzles the entries.
>> Why not check for:
>> tcache->counts[tc_idx] != 0
Right, sorry — then maybe use a value different from the one it should hold when empty?
I mean it should always be better to compare with a constant value.
Swizzling the entries will make it more complex, I think.
>
> That checks for tcache not full rather than not empty, so it won't work.
> Pointer swizzling would need an extra swizzle here.
>
>>> - if (tcache->counts[tc_idx] < mp_.tcache_count)
>>> + if (tcache->counts[tc_idx] != 0)
>> You do it here.
>
> This checks whether tcache is not full.
>
> There are alternative schemes possible, eg. using combined
> dual counters that can give a zero for both full and empty,
> but those are more complex than this.
>
> Cheers,
> Wilco
Cheers,
Cupertino
Hi Cupertino,
>> - && tcache->counts[tc_idx] > 0)
>> + && tcache->entries[tc_idx] != NULL)
>> This will not be Ok for the pointer sizzling, which sizzles the entries.
>> Why not check for:
>> tcache->counts[tc_idx] != 0
> Right sorry, then maybe different from the value it should be when empty ?
> I mean it should always be better to compare with a constant value.
> Sizzling the entries will make it more complex, I think.
We don't swizzle the first entry so that you can easily compare with NULL. Since the
goal is to allow a different number of entries in each bin, there isn't a constant or
global one can use either.
You can easily unswizzle if you need to for the large tcache code - after [1] tcache_available
is only used by _mid_memalign, so we can just inline it there. Using tcache_available is not
really useful for large blocks anyway as a non-NULL pointer does not imply a block of the
required size (or larger) is available.
Cheers,
Wilco
[1] https://sourceware.org/pipermail/libc-alpha/2025-April/165720.html
Hi Wilco,
I have no strong opinion on this.
For what is worth, patch is Ok to me.
Also I think it is possible to adapt pointer swizzling to leave the entry
as NULL, and still do the swizzling on the entry.
Will work on such a patch on top of my pointer swizzling one.
Cheers,
Cupertino
On 16-04-2025 18:19, Wilco Dijkstra wrote:
> Hi Cupertino,
>
>>> - && tcache->counts[tc_idx] > 0)
>>> + && tcache->entries[tc_idx] != NULL)
>>> This will not be Ok for the pointer sizzling, which sizzles the entries.
>>> Why not check for:
>>> tcache->counts[tc_idx] != 0
>> Right sorry, then maybe different from the value it should be when empty ?
>> I mean it should always be better to compare with a constant value.
>> Sizzling the entries will make it more complex, I think.
>
> We don't swizzle the first entry so that you can easily compare with NULL. Since the
> goal is to allow a different number of entries in each bin, there isn't a constant or
> global one can use either.
>
> You can easily unswizzle if you need to for the large tcache code - after [1] tcache_available
> is only used by _mid_memalign, so we can just inline it there. Using tcache_available is not
> really useful for large blocks anyway as a non-NULL pointer does not imply a block of the
> required size (or larger) is available.
>
> Cheers,
> Wilco
>
> [1] https://sourceware.org/pipermail/libc-alpha/2025-April/165720.html
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> @@ -3337,6 +3337,8 @@ tcache_init(void)
> {
> tcache = (tcache_perthread_struct *) victim;
> memset (tcache, 0, sizeof (tcache_perthread_struct));
> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
> + tcache->counts[i] = mp_.tcache_count;
> }
> }
In theory, the tunables can be changed at any time. The existing code
happens to work with this, at the expense of a global read. We need to
decide if we want to continue supporting that or if tunables are a
set-once-at-startup forever. I have a pending patch set that makes
tunables more dynamic and may need this support, not that I'm biased ;-)
Hi DJ,
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> @@ -3337,6 +3337,8 @@ tcache_init(void)
> {
> tcache = (tcache_perthread_struct *) victim;
> memset (tcache, 0, sizeof (tcache_perthread_struct));
> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
> + tcache->counts[i] = mp_.tcache_count;
> }
> }
> In theory, the tunables can be changed at any time. The existing code
> happens to work with this, at the expense of a global read. We need to
> decide if we want to continue supporting that or if tunables are a
> set-once-at-startup forever. I have a pending patch set that makes
> tunables more dynamic and may need this support, not that I'm biased ;-)
Note after my patch you can still make changes to tcache tunables. Changes
could be immediate for the current thread, but other threads would update
their tcache config on a cold path (like consolidation or an mmap).
And we really should consolidate tcache and remove the insane throw
everything into the unsorted bin, try to merge and then move it all back again...
Basically assuming mxfast is removed, all we need to do is free tcache entries
so that they are either merged or placed in the small bins in their arena.
If we do this periodically then we may not even need to place restrictive limits
on the number of tcache entries - we could consolidate based on number of
calls to malloc/free, total amount of memory in tcache chunks etc. Even if we
kept counts per entry, it makes far more sense to vary the maximum based on
block size rather than use a single value for all entries (7 1024-byte chunks take
the same amount of memory as 224 32-byte chunks!).
So I'm not sure the existing tcache_count is useful at all...
Cheers,
Wilco
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Note after my patch you can still make changes to tcache tunables. Changes
> could be immediate for the current thread, but other threads would update
> their tcache config on a cold path (like consolidation or an mmap).
If the counts array is now "count of empty slots", then a change in
tunables would require a change to every tcache count, to indicate that
more or fewer empty slots are available. Recall that one of the
tunables is the number of chunks cached per tcache bin, so if this
tunable goes up by 2, every counts[] in every thread would need to be
incremented by 2.
It is far easier for threads to just compare *actual* counts with a
changing policy limit.
Also, if "counts" is now counts of *empty* tcache slots, it needs a more
obvious name.
> If we do this periodically then we may not even need to place restrictive limits
> on the number of tcache entries
Given what I've seen of fastbins and fragmentation, some way to limit
growth should always be considered for new malloc features.
Hi DJ,
> It is far easier for threads to just compare *actual* counts with a
> changing policy limit.
It seems easier, but that doesn't mean it works well. Changing global variables
used concurrently by many threads without atomics causes many issues...
Even if you don't get concurrency issues, chunks would be locked into tcache
until the thread terminates if you lower tcache_bins or decrease tcache_count.
The same issue exists for the fast bins.
And the question is, who is interested in changing tcache dynamically?
Is there any application that changes mxfast dynamically? I can see it might
be useful for debugging and testing malloc, but we don't need to add public
interfaces to do that.
> Also, if "counts" is now counts of *empty* tcache slots, it needs a more
> obvious name.
Like "num_slots" or "free_entries"?
>> If we do this periodically then we may not even need to place restrictive limits
>> on the number of tcache entries
>
> Given what I've seen of fastbins and fragmentation, some way to limit
> growth should always be considered for new malloc features.
It would still be a limit on growth if it is done regularly. Even today it would be
good to flush tcache every now and again.
Overall an allocation budget seems better, ie. set a maximum amount of memory
cached in tcache. I'll do some experiments, but if we want to remove fastbins in
the future, we need to improve tcache effectiveness.
Cheers,
Wilco
* DJ Delorie:
> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
>> @@ -3337,6 +3337,8 @@ tcache_init(void)
>> {
>> tcache = (tcache_perthread_struct *) victim;
>> memset (tcache, 0, sizeof (tcache_perthread_struct));
>> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
>> + tcache->counts[i] = mp_.tcache_count;
>> }
>> }
>
> In theory, the tunables can be changed at any time. The existing code
> happens to work with this, at the expense of a global read. We need to
> decide if we want to continue supporting that or if tunables are a
> set-once-at-startup forever. I have a pending patch set that makes
> tunables more dynamic and may need this support, not that I'm biased ;-)
We currently do not have a way to change tunables at run time, and
it's not safe in general. If this were to change and we want this to
be effective for malloc tunables, we'd have to write extra code with
special synchronization either way. So I don't think Wilco's change
makes things much harder.
Hi DJ, Wilco, Florian,
I wonder if we do not have a race around mallopt already.
First look at the do_set_... functions seem to change shared variables
not really taking any consideration for concurrent accesses.
Cheers,
Cupertino
On 07-05-2025 12:15, Wilco Dijkstra wrote:
> Hi DJ,
>
>> It is far easier for threads to just compare *actual* counts with a
>> changing policy limit.
>
> It seems easier, but that doesn't mean it works well. Changing global variables
> used concurrently by many threads without atomics causes many issues...
>
> Even if you don't get concurrency issues, chunks would be locked into tcache
> until the thread terminates if you lower tcache_bins or decrease tcache_count.
> The same issue exists for the fast bins.
>
> And the question is, who is interested in changing tcache dynamically?
> Is there any application that changes mxfast dynamically? I can see it might
> be useful for debugging and testing malloc, but we don't need to add public
> interfaces to do that.
>
>> Also, if "counts" is now counts of *empty* tcache slots, it needs a more
>> obvious name.
>
> Like "num_slots" or "free_entries"?
>
>>> If we do this periodically then we may not even need to place restrictive limits
>>> on the number of tcache entries
>>
>> Given what I've seen of fastbins and fragmentation, some way to limit
>> growth should always be considered for new malloc features.
>
> It would still be a limit on growth if it is done regularly. Even today it would be
> good to flush tcache every now and again.
>
> Overall an allocation budget seems better, ie. set a maximum amount of memory
> cached in tcache. I'll do some experiments, but if we want to remove fastbins in
> the future, we need to improve tcache effectiveness.
>
> Cheers,
> Wilco
On 07-05-2025 13:27, Cupertino Miranda wrote:
> Hi DJ, Wilco, Florian,
>
> I wonder if we do not have a race around mallopt already.
> First look at the do_set_... functions seem to change shared variables
> not really taking any consideration for concurrent accesses.
Not entirely true, __libc_mallopt locks the main arena, still I think
this might not be enough.
>
> Cheers,
> Cupertino
>
>
> On 07-05-2025 12:15, Wilco Dijkstra wrote:
>> Hi DJ,
>>
>>> It is far easier for threads to just compare *actual* counts with a
>>> changing policy limit.
>>
>> It seems easier, but that doesn't mean it works well. Changing global
>> variables
>> used concurrently by many threads without atomics causes many issues...
>>
>> Even if you don't get concurrency issues, chunks would be locked into
>> tcache
>> until the thread terminates if you lower tcache_bins or decrease
>> tcache_count.
>> The same issue exists for the fast bins.
>>
>> And the question is, who is interested in changing tcache dynamically?
>> Is there any application that changes mxfast dynamically? I can see it
>> might
>> be useful for debugging and testing malloc, but we don't need to add
>> public
>> interfaces to do that.
>>
>>> Also, if "counts" is now counts of *empty* tcache slots, it needs a more
>>> obvious name.
>>
>> Like "num_slots" or "free_entries"?
>>
>>>> If we do this periodically then we may not even need to place
>>>> restrictive limits
>>>> on the number of tcache entries
>>>
>>> Given what I've seen of fastbins and fragmentation, some way to limit
>>> growth should always be considered for new malloc features.
>>
>> It would still be a limit on growth if it is done regularly. Even
>> today it would be
>> good to flush tcache every now and again.
>>
>> Overall an allocation budget seems better, ie. set a maximum amount of
>> memory
>> cached in tcache. I'll do some experiments, but if we want to remove
>> fastbins in
>> the future, we need to improve tcache effectiveness.
>>
>> Cheers,
>> Wilco
>
Hi Cupertino,
> I wonder if we do not have a race around mallopt already.
> First look at the do_set_... functions seem to change shared variables
> not really taking any consideration for concurrent accesses.
We do indeed. It's probably not a huge issue that there is no use of
atomics (only a few older targets will have issues), however none of the
code was designed with concurrent changes of the globals in mind.
So the same global can change when you read it multiple times,
and you have to somehow deal with that correctly...
Cheers,
Wilco
On 07/05/25 08:42, Florian Weimer wrote:
> * DJ Delorie:
>
>> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
>>> @@ -3337,6 +3337,8 @@ tcache_init(void)
>>> {
>>> tcache = (tcache_perthread_struct *) victim;
>>> memset (tcache, 0, sizeof (tcache_perthread_struct));
>>> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
>>> + tcache->counts[i] = mp_.tcache_count;
>>> }
>>> }
>>
>> In theory, the tunables can be changed at any time. The existing code
>> happens to work with this, at the expense of a global read. We need to
>> decide if we want to continue supporting that or if tunables are a
>> set-once-at-startup forever. I have a pending patch set that makes
>> tunables more dynamic and may need this support, not that I'm biased ;-)
>
> We currently do not have a way to change tunables at run time, and
> it's not safe in general. If this were to change and we want this to
> be effective for malloc tunables, we'd have to write extra code with
> special synchronization either way. So I don't think Wilco's change
> makes things much harder.
The tunable_list is currently marked as attribute_relro and I think the
current idea of the tunable framework is to have set-once-at-startup. It
means the user knows which parameters was set so it can know which kind
of performance, security, or any other values are used and expected during
program execution.
Besides the synchronization issues, there is also possible security
implications if we start to make the tunable_list no rdonly during
process execution (since we use to enable PAC, memory tagging, and now
GCS support). We will need to at least split between two list, which
will adds more complexity.
So I think for malloc we should keep using the tunables as rdonly and
set-once-at-startup and add extra internal variables to have this
dynamic mechanism. What kind of support are envisioning that requires
dynamic tunables?
On 07-05-2025 13:42, Wilco Dijkstra wrote:
> Hi Cupertino,
>
>> I wonder if we do not have a race around mallopt already.
>> First look at the do_set_... functions seem to change shared variables
>> not really taking any consideration for concurrent accesses.
>
> We do indeed. It's probably not a huge issue that there is no use of
> atomics (only a few older targets will have issues), however none of the
> code was designed with concurrent changes of the globals in mind.
> So the same global can change when you read it multiple times,
> and you have to somehow deal with that correctly...
Right! Some weird side effects could happen in those cases.
>
> Cheers,
> Wilco
Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> writes:
> The tunable_list is currently marked as attribute_relro and I think the
> current idea of the tunable framework is to have set-once-at-startup.
Given that and the other comments, I'm ok with the concept of recording
free slots instead of used slots for tcache.
tl;dr: needs to be rebased to head, and ->counts renamed, but logic is
OK. LGTM with those changes.
Reviewed-by: DJ Delorie <dj@redhat.com>
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Currently tcache requires 2 global variable accesses to determine
> whether a block can be added to the tcache. Change the counts array
> to indicate the number of entries that could be added by counting
> downwards. If the count reaches zero, no more blocks can be added.
> If the entries pointer is not NULL, at least one block is available
> for allocation.
>
> Now each tcache bin can support a different maximum number of blocks,
> and they can be individually switched on or off (a zero initialized
> count+entry means the tcache bin is not available for free or malloc).
FYI this patch no longer applies to master (malloc.c is busy! yay!) but
I'll review the code as-is. I'll also ignore the request to rename
tcache->counts to something that reflects its new purpose, as we could
bikeshed that forever.
> @@ -3169,7 +3169,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>
> e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
> tcache->entries[tc_idx] = e;
> - ++(tcache->counts[tc_idx]);
> + --(tcache->counts[tc_idx]);
> }
tcache->counts is how many more chunks we can put on the list, so adding
something to the list (tcache_put) should reduce this number. Ok.
> @@ -3192,7 +3192,7 @@ tcache_get_n (size_t tc_idx, tcache_entry **ep)
> else
> *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
>
> - --(tcache->counts[tc_idx]);
> + ++(tcache->counts[tc_idx]);
> e->key = 0;
> return (void *) e;
This is the opposite (tcache_get) so should increment, since we're
"freeing" a slot. Ok.
> @@ -3217,7 +3217,7 @@ tcache_available (size_t tc_idx)
> {
> if (tc_idx < mp_.tcache_bins
> && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + && tcache->entries[tc_idx] != NULL)
> return true;
> else
> return false;
In this version, we protect pointers that are inside the chunks
(e->next) but not pointers outside of chunks (tcache->entries).
So the check here goes from "at least one chunk" to "list is not
empty". Ok.
> @@ -3265,7 +3265,7 @@ tcache_free (mchunkptr p, INTERNAL_SIZE_T size)
> if (__glibc_unlikely (e->key == tcache_key))
> tcache_double_free_verify (e, tc_idx);
>
> - if (tcache->counts[tc_idx] < mp_.tcache_count)
> + if (tcache->counts[tc_idx] != 0)
> {
This is testing for "can we put a chunk in tcache". The new logic is
"has slots". tcache->counts is a UINT16 so !=0 and >0 are semantically
the same here, so ok.
> @@ -3337,6 +3337,8 @@ tcache_init(void)
> {
> tcache = (tcache_perthread_struct *) victim;
> memset (tcache, 0, sizeof (tcache_perthread_struct));
> + for (int i = 0; i < TCACHE_MAX_BINS; i++)
> + tcache->counts[i] = mp_.tcache_count;
> }
This is the size of tcache->counts; we initialize each to the tunable
value so we start with "count slots free." Tunables are set by
ptmalloc_init, which is called from libc_early_init.c, so they'll be set
by this point. Ok.
> @@ -4006,8 +4008,7 @@ _int_malloc (mstate av, size_t bytes)
> mchunkptr tc_victim;
>
> /* While bin not empty and tcache not full, copy chunks. */
> - while (tcache->counts[tc_idx] < mp_.tcache_count
> - && (tc_victim = *fb) != NULL)
> + while (tcache->counts[tc_idx] != 0 && (tc_victim = *fb) != NULL)
> {
Ok.
> @@ -4067,8 +4068,7 @@ _int_malloc (mstate av, size_t bytes)
> mchunkptr tc_victim;
>
> /* While bin not empty and tcache not full, copy chunks over. */
> - while (tcache->counts[tc_idx] < mp_.tcache_count
> - && (tc_victim = last (bin)) != bin)
> + while (tcache->counts[tc_idx] != 0 && (tc_victim = last (bin)) != bin)
> {
Ok.
> @@ -4204,8 +4204,7 @@ _int_malloc (mstate av, size_t bytes)
> #if USE_TCACHE
> /* Fill cache first, return to user only if cache fills.
> We may return one of these chunks later. */
> - if (tcache_nb > 0
> - && tcache->counts[tc_idx] < mp_.tcache_count)
> + if (tcache_nb > 0 && tcache->counts[tc_idx] != 0)
> {
Ok.
@@ -3169,7 +3169,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
tcache->entries[tc_idx] = e;
- ++(tcache->counts[tc_idx]);
+ --(tcache->counts[tc_idx]);
}
/* Caller must ensure that we know tc_idx is valid and there's
@@ -3192,7 +3192,7 @@ tcache_get_n (size_t tc_idx, tcache_entry **ep)
else
*ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
- --(tcache->counts[tc_idx]);
+ ++(tcache->counts[tc_idx]);
e->key = 0;
return (void *) e;
}
@@ -3217,7 +3217,7 @@ tcache_available (size_t tc_idx)
{
if (tc_idx < mp_.tcache_bins
&& tcache != NULL
- && tcache->counts[tc_idx] > 0)
+ && tcache->entries[tc_idx] != NULL)
return true;
else
return false;
@@ -3265,7 +3265,7 @@ tcache_free (mchunkptr p, INTERNAL_SIZE_T size)
if (__glibc_unlikely (e->key == tcache_key))
tcache_double_free_verify (e, tc_idx);
- if (tcache->counts[tc_idx] < mp_.tcache_count)
+ if (tcache->counts[tc_idx] != 0)
{
tcache_put (p, tc_idx);
done = true;
@@ -3337,6 +3337,8 @@ tcache_init(void)
{
tcache = (tcache_perthread_struct *) victim;
memset (tcache, 0, sizeof (tcache_perthread_struct));
+ for (int i = 0; i < TCACHE_MAX_BINS; i++)
+ tcache->counts[i] = mp_.tcache_count;
}
}
@@ -4006,8 +4008,7 @@ _int_malloc (mstate av, size_t bytes)
mchunkptr tc_victim;
/* While bin not empty and tcache not full, copy chunks. */
- while (tcache->counts[tc_idx] < mp_.tcache_count
- && (tc_victim = *fb) != NULL)
+ while (tcache->counts[tc_idx] != 0 && (tc_victim = *fb) != NULL)
{
if (__glibc_unlikely (misaligned_chunk (tc_victim)))
malloc_printerr ("malloc(): unaligned fastbin chunk detected 3");
@@ -4067,8 +4068,7 @@ _int_malloc (mstate av, size_t bytes)
mchunkptr tc_victim;
/* While bin not empty and tcache not full, copy chunks over. */
- while (tcache->counts[tc_idx] < mp_.tcache_count
- && (tc_victim = last (bin)) != bin)
+ while (tcache->counts[tc_idx] != 0 && (tc_victim = last (bin)) != bin)
{
if (tc_victim != NULL)
{
@@ -4204,8 +4204,7 @@ _int_malloc (mstate av, size_t bytes)
#if USE_TCACHE
/* Fill cache first, return to user only if cache fills.
We may return one of these chunks later. */
- if (tcache_nb > 0
- && tcache->counts[tc_idx] < mp_.tcache_count)
+ if (tcache_nb > 0 && tcache->counts[tc_idx] != 0)
{
tcache_put (victim, tc_idx);
return_cached = 1;