malloc: Count tcache entries downwards

Message ID PAWPR08MB8982F23E02BA7BBDC488369083B72@PAWPR08MB8982.eurprd08.prod.outlook.com (mailing list archive)
State New
Series malloc: Count tcache entries downwards

Checks

Context                                            Check    Description
redhat-pt-bot/TryBot-apply_patch                   success  Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm       success  Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm       success  Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64   success  Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64   success  Test passed
redhat-pt-bot/TryBot-32bit                         success  Build for i686

Commit Message

Wilco Dijkstra April 10, 2025, 1:43 p.m. UTC
  Currently tcache requires 2 global variable accesses to determine
whether a block can be added to the tcache.  Change the counts array
to indicate the number of entries that could be added by counting
downwards.  If the count reaches zero, no more blocks can be added.
If the entries pointer is not NULL, at least one block is available
for allocation.

Now each tcache bin can support a different maximum number of blocks,
and they can be individually switched on or off (a zero initialized
count+entry means the tcache bin is not available for free or malloc).

Performance is neutral on current trunk (+1.4% on malloc-bench-thread 32),
but after the outstanding improvements to free it gives ~5%.

---
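For illustration, a minimal self-contained model of the scheme described above (a sketch only, not the glibc code: the names are simplified stand-ins and the real pointer protection is omitted):

/* counts[] holds the remaining capacity of a bin and counts DOWN;
   entries[] is the per-bin free list head.  LIMIT stands in for
   mp_.tcache_count.  */

#include <assert.h>
#include <stddef.h>

enum { LIMIT = 7 };

struct entry { struct entry *next; };

static struct entry *head;         /* Models tcache->entries[tc_idx].  */
static unsigned count = LIMIT;     /* Models tcache->counts[tc_idx] after init.  */

static void
put (struct entry *e)              /* Like tcache_put: capacity goes DOWN.  */
{
  e->next = head;
  head = e;
  --count;
}

static struct entry *
get (void)                         /* Like tcache_get: capacity goes back UP.  */
{
  struct entry *e = head;
  head = e->next;
  ++count;
  return e;
}

int
main (void)
{
  struct entry blocks[LIMIT];

  /* Free path: a single "count != 0" test on the per-thread structure.  */
  for (int i = 0; i < LIMIT && count != 0; i++)
    put (&blocks[i]);
  assert (count == 0 && head != NULL);       /* Bin is now full.  */

  /* Malloc path: availability is just "head != NULL".  */
  while (head != NULL)
    get ();
  assert (count == LIMIT && head == NULL);   /* Back to empty.  */

  return 0;
}

The free path then tests a single per-bin field against zero instead of also loading the global mp_.tcache_count, which is where the saving described in the commit message comes from.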
  

Comments

Cupertino Miranda April 11, 2025, 7:35 a.m. UTC | #1
Hi Wilco,

On 10-04-2025 14:43, Wilco Dijkstra wrote:
> 
> Currently tcache requires 2 global variable accesses to determine
> whether a block can be added to the tcache.  Change the counts array
> to indicate the number of entries that could be added by counting
> downwards.  If the count reaches zero, no more blocks can be added.
> If the entries pointer is not NULL, at least one block is available
> for allocation.
> 
> Now each tcache bin can support a different maximum number of blocks,
> and they can be individually switched on or off (a zero initialized
> count+entry means the tcache bin is not available for free or malloc).
> 
> Performance is neutral on current trunk (+1.4% on malloc-bench-thread 32),
> but after the outstanding improvements to free it gives ~5%.
> 
> ---
> 
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index a0bc733482532ce34684d0357cb9076b03ac8a52..c3e46518c5335467bb9b6ba0f0e96c4d18ee25e6 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3169,7 +3169,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>   
>     e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
>     tcache->entries[tc_idx] = e;
> -  ++(tcache->counts[tc_idx]);
> +  --(tcache->counts[tc_idx]);
>   }
>   
>   /* Caller must ensure that we know tc_idx is valid and there's
> @@ -3192,7 +3192,7 @@ tcache_get_n (size_t tc_idx, tcache_entry **ep)
>     else
>       *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
>   
> -  --(tcache->counts[tc_idx]);
> +  ++(tcache->counts[tc_idx]);
>     e->key = 0;
>     return (void *) e;
>   }
> @@ -3217,7 +3217,7 @@ tcache_available (size_t tc_idx)
>   {
>     if (tc_idx < mp_.tcache_bins
>         && tcache != NULL
> -      && tcache->counts[tc_idx] > 0)
> +      && tcache->entries[tc_idx] != NULL)
This will not be OK for the pointer swizzling, which swizzles the entries.
Why not check for:
   tcache->counts[tc_idx] != 0

>       return true;
>     else
>       return false;
> @@ -3265,7 +3265,7 @@ tcache_free (mchunkptr p, INTERNAL_SIZE_T size)
>         if (__glibc_unlikely (e->key == tcache_key))
>   	tcache_double_free_verify (e, tc_idx);
>   
> -      if (tcache->counts[tc_idx] < mp_.tcache_count)
> +      if (tcache->counts[tc_idx] != 0)
You do it here.
>   	{
>   	  tcache_put (p, tc_idx);
>   	  done = true;
> @@ -3337,6 +3337,8 @@ tcache_init(void)
>       {
>         tcache = (tcache_perthread_struct *) victim;
>         memset (tcache, 0, sizeof (tcache_perthread_struct));
> +      for (int i = 0; i < TCACHE_MAX_BINS; i++)
> +	tcache->counts[i] = mp_.tcache_count;
>       }
>   
>   }
> @@ -4006,8 +4008,7 @@ _int_malloc (mstate av, size_t bytes)
>   		  mchunkptr tc_victim;
>   
>   		  /* While bin not empty and tcache not full, copy chunks.  */
> -		  while (tcache->counts[tc_idx] < mp_.tcache_count
> -			 && (tc_victim = *fb) != NULL)
> +		  while (tcache->counts[tc_idx] != 0 && (tc_victim = *fb) != NULL)
>   		    {
>   		      if (__glibc_unlikely (misaligned_chunk (tc_victim)))
>   			malloc_printerr ("malloc(): unaligned fastbin chunk detected 3");
> @@ -4067,8 +4068,7 @@ _int_malloc (mstate av, size_t bytes)
>   	      mchunkptr tc_victim;
>   
>   	      /* While bin not empty and tcache not full, copy chunks over.  */
> -	      while (tcache->counts[tc_idx] < mp_.tcache_count
> -		     && (tc_victim = last (bin)) != bin)
> +	      while (tcache->counts[tc_idx] != 0 && (tc_victim = last (bin)) != bin)
>   		{
>   		  if (tc_victim != NULL)
>   		    {
> @@ -4204,8 +4204,7 @@ _int_malloc (mstate av, size_t bytes)
>   #if USE_TCACHE
>   	      /* Fill cache first, return to user only if cache fills.
>   		 We may return one of these chunks later.  */
> -	      if (tcache_nb > 0
> -		  && tcache->counts[tc_idx] < mp_.tcache_count)
> +	      if (tcache_nb > 0 && tcache->counts[tc_idx] != 0)
>   		{
>   		  tcache_put (victim, tc_idx);
>   		  return_cached = 1;
> 
>
  
Wilco Dijkstra April 11, 2025, 10:52 a.m. UTC | #2
Hi Cupertino,

> -      && tcache->counts[tc_idx] > 0)
> +      && tcache->entries[tc_idx] != NULL)
> This will not be OK for the pointer swizzling, which swizzles the entries.
> Why not check for:
>   tcache->counts[tc_idx] != 0

That checks for tcache not full rather than not empty, so it won't work.
Pointer swizzling would need an extra swizzle here.

>> -      if (tcache->counts[tc_idx] < mp_.tcache_count)
>> +      if (tcache->counts[tc_idx] != 0)
> You do it here.

This checks whether tcache is not full.

There are alternative schemes possible, e.g. using combined
dual counters that can give a zero for both full and empty,
but those are more complex than this.

Cheers,
Wilco
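
To make the full/empty distinction above concrete, here is a small self-contained sketch (illustrative names only, not the glibc code; LIMIT stands in for mp_.tcache_count):

#include <assert.h>
#include <stddef.h>

enum { LIMIT = 7 };

struct bin
{
  unsigned count;            /* Remaining capacity, counting down from LIMIT.  */
  void *head;                /* Free-list head; NULL when the bin is empty.  */
};

int
main (void)
{
  int block;                                        /* Placeholder for a cached chunk.  */
  struct bin b = { .count = LIMIT, .head = NULL };

  /* Freshly initialized: not full (free may cache here), but empty.  */
  assert (b.count != 0 && b.head == NULL);

  /* After three simulated frees: neither full nor empty.  */
  b.count = LIMIT - 3;
  b.head = &block;
  assert (b.count != 0 && b.head != NULL);

  /* After LIMIT frees: full (capacity exhausted), yet blocks are available,
     so "count != 0" cannot stand in for "not empty".  */
  b.count = 0;
  assert (b.count == 0 && b.head != NULL);

  /* Zero-initialized bin: both tests fail, so it is effectively disabled.  */
  struct bin off = { 0 };
  assert (off.count == 0 && off.head == NULL);

  return 0;
}

This is why the tcache_available change tests entries[] rather than counts[]: with the count inverted, counts[] alone can no longer answer "not empty".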
  
Carlos O'Donell April 11, 2025, 12:22 p.m. UTC | #3
On 4/11/25 6:52 AM, Wilco Dijkstra wrote:
> Hi Cupertino,
> 
>> -      && tcache->counts[tc_idx] > 0)
>> +      && tcache->entries[tc_idx] != NULL)
>> This will not be OK for the pointer swizzling, which swizzles the entries.
>> Why not check for:
>>     tcache->counts[tc_idx] != 0
> 
> That checks for tcache not full rather than not empty, so it won't work.
> Pointer swizzling would need an extra swizzle here.
> 
>>> -      if (tcache->counts[tc_idx] < mp_.tcache_count)
>>> +      if (tcache->counts[tc_idx] != 0)
>> You do it here.
> 
> This checks whether tcache is not full.
> 
> There are alternative schemes possible, e.g. using combined
> dual counters that can give a zero for both full and empty,
> but those are more complex than this.

... and less complex is better.

The implementation should be simple enough to debug, and obvious to
understand when reading the code.

It may not always be easy to keep all the parts straight in your head,
but it should be possible to write top-level documents like:

https://sourceware.org/glibc/wiki/MallocInternals

And keep them updated so anyone using or developing glibc can
understand the top-level heuristics.
  
Cupertino Miranda April 11, 2025, 12:49 p.m. UTC | #4
Hi Wilco,

On 11-04-2025 11:52, Wilco Dijkstra wrote:
> Hi Cupertino,
> 
>> -      && tcache->counts[tc_idx] > 0)
>> +      && tcache->entries[tc_idx] != NULL)
>> This will not be OK for the pointer swizzling, which swizzles the entries.
>> Why not check for:
>>     tcache->counts[tc_idx] != 0
Right, sorry. Then maybe check that it is different from the value it should have when empty?
I mean, it should always be better to compare with a constant value.
Swizzling the entries will make it more complex, I think.

> 
> That checks for tcache not full rather than not empty, so it won't work.
> Pointer swizzling would need an extra swizzle here.
> 
>>> -      if (tcache->counts[tc_idx] < mp_.tcache_count)
>>> +      if (tcache->counts[tc_idx] != 0)
>> You do it here.
> 
> This checks whether tcache is not full.
> 
> There are alternative schemes possible, e.g. using combined
> dual counters that can give a zero for both full and empty,
> but those are more complex than this.
> 
> Cheers,
> Wilco

Cheers,
Cupertino
  
Wilco Dijkstra April 16, 2025, 5:19 p.m. UTC | #5
Hi Cupertino,

>> -      && tcache->counts[tc_idx] > 0)
>> +      && tcache->entries[tc_idx] != NULL)
>> This will not be OK for the pointer swizzling, which swizzles the entries.
>> Why not check for:
>>     tcache->counts[tc_idx] != 0
> Right, sorry. Then maybe check that it is different from the value it should have when empty?
> I mean, it should always be better to compare with a constant value.
> Swizzling the entries will make it more complex, I think.

We don't swizzle the first entry so that you can easily compare with NULL. Since the
goal is to allow a different number of entries in each bin, there isn't a constant or
global one can use either.

You can easily unswizzle if you need to for the large tcache code - after [1] tcache_available
is only used by _mid_memalign, so we can just inline it there. Using tcache_available is not
really useful for large blocks anyway as a non-NULL pointer does not imply a block of the
required size (or larger) is available.

Cheers,
Wilco

[1] https://sourceware.org/pipermail/libc-alpha/2025-April/165720.html
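
For readers following the thread, a self-contained sketch of the layout being described: the per-bin head stays in the clear so it can be compared with NULL directly, while chained next pointers are obfuscated in the spirit of glibc's PROTECT_PTR/REVEAL_PTR (protect() below is a simplified stand-in, not the real macro):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct entry { struct entry *next; };

/* Mix the storage address into the pointer; applying it twice with the same
   storage address gives back the original pointer.  */
static struct entry *
protect (void *pos, struct entry *ptr)
{
  return (struct entry *) (((uintptr_t) pos >> 12) ^ (uintptr_t) ptr);
}

int
main (void)
{
  struct entry a, b;
  struct entry *head = NULL;               /* Head kept in the clear.  */

  /* Push a, then b; only the chained next links are obfuscated.  */
  a.next = protect (&a.next, head);
  head = &a;
  b.next = protect (&b.next, head);
  head = &b;

  /* The availability check needs no de-obfuscation at all.  */
  assert (head != NULL);

  /* Links are revealed only as the list is walked.  */
  struct entry *second = protect (&head->next, head->next);
  assert (second == &a);

  return 0;
}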
  
Cupertino Miranda April 17, 2025, 11:54 a.m. UTC | #6
Hi Wilco,

I have no strong opinion on this.
For what it's worth, the patch is OK to me.

Also, I think it is possible to adapt the pointer swizzling so that an empty
entry stays NULL while non-empty entries are still swizzled.
I will work on such a patch on top of my pointer swizzling one.

Cheers,
Cupertino

On 16-04-2025 18:19, Wilco Dijkstra wrote:
> Hi Cupertino,
> 
>>> -      && tcache->counts[tc_idx] > 0)
>>> +      && tcache->entries[tc_idx] != NULL)
>>> This will not be OK for the pointer swizzling, which swizzles the entries.
>>> Why not check for:
>>>       tcache->counts[tc_idx] != 0
>> Right, sorry. Then maybe check that it is different from the value it should have when empty?
>> I mean, it should always be better to compare with a constant value.
>> Swizzling the entries will make it more complex, I think.
> 
> We don't swizzle the first entry so that you can easily compare with NULL. Since the
> goal is to allow a different number of entries in each bin, there isn't a constant or
> global one can use either.
> 
> You can easily unswizzle if you need to for the large tcache code - after [1] tcache_available
> is only used by _mid_memalign, so we can just inline it there. Using tcache_available is not
> really useful for large blocks anyway as a non-NULL pointer does not imply a block of the
> required size (or larger) is available.
> 
> Cheers,
> Wilco
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2025-April/165720.html
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index a0bc733482532ce34684d0357cb9076b03ac8a52..c3e46518c5335467bb9b6ba0f0e96c4d18ee25e6 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3169,7 +3169,7 @@  tcache_put (mchunkptr chunk, size_t tc_idx)
 
   e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
   tcache->entries[tc_idx] = e;
-  ++(tcache->counts[tc_idx]);
+  --(tcache->counts[tc_idx]);
 }
 
 /* Caller must ensure that we know tc_idx is valid and there's
@@ -3192,7 +3192,7 @@  tcache_get_n (size_t tc_idx, tcache_entry **ep)
   else
     *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
 
-  --(tcache->counts[tc_idx]);
+  ++(tcache->counts[tc_idx]);
   e->key = 0;
   return (void *) e;
 }
@@ -3217,7 +3217,7 @@  tcache_available (size_t tc_idx)
 {
   if (tc_idx < mp_.tcache_bins
       && tcache != NULL
-      && tcache->counts[tc_idx] > 0)
+      && tcache->entries[tc_idx] != NULL)
     return true;
   else
     return false;
@@ -3265,7 +3265,7 @@  tcache_free (mchunkptr p, INTERNAL_SIZE_T size)
       if (__glibc_unlikely (e->key == tcache_key))
 	tcache_double_free_verify (e, tc_idx);
 
-      if (tcache->counts[tc_idx] < mp_.tcache_count)
+      if (tcache->counts[tc_idx] != 0)
 	{
 	  tcache_put (p, tc_idx);
 	  done = true;
@@ -3337,6 +3337,8 @@  tcache_init(void)
     {
       tcache = (tcache_perthread_struct *) victim;
       memset (tcache, 0, sizeof (tcache_perthread_struct));
+      for (int i = 0; i < TCACHE_MAX_BINS; i++)
+	tcache->counts[i] = mp_.tcache_count;
     }
 
 }
@@ -4006,8 +4008,7 @@  _int_malloc (mstate av, size_t bytes)
 		  mchunkptr tc_victim;
 
 		  /* While bin not empty and tcache not full, copy chunks.  */
-		  while (tcache->counts[tc_idx] < mp_.tcache_count
-			 && (tc_victim = *fb) != NULL)
+		  while (tcache->counts[tc_idx] != 0 && (tc_victim = *fb) != NULL)
 		    {
 		      if (__glibc_unlikely (misaligned_chunk (tc_victim)))
 			malloc_printerr ("malloc(): unaligned fastbin chunk detected 3");
@@ -4067,8 +4068,7 @@  _int_malloc (mstate av, size_t bytes)
 	      mchunkptr tc_victim;
 
 	      /* While bin not empty and tcache not full, copy chunks over.  */
-	      while (tcache->counts[tc_idx] < mp_.tcache_count
-		     && (tc_victim = last (bin)) != bin)
+	      while (tcache->counts[tc_idx] != 0 && (tc_victim = last (bin)) != bin)
 		{
 		  if (tc_victim != NULL)
 		    {
@@ -4204,8 +4204,7 @@  _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
 	      /* Fill cache first, return to user only if cache fills.
 		 We may return one of these chunks later.  */
-	      if (tcache_nb > 0
-		  && tcache->counts[tc_idx] < mp_.tcache_count)
+	      if (tcache_nb > 0 && tcache->counts[tc_idx] != 0)
 		{
 		  tcache_put (victim, tc_idx);
 		  return_cached = 1;
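
A closing note on the tcache_init hunk: since the limit now lives only in the initial value of each counts[] slot, per-bin capacities and per-bin disabling fall out naturally, as the commit message notes. A sketch under assumed names (hypothetical tuning, not part of this patch):

#include <string.h>

#define NBINS 64                   /* Stand-in for TCACHE_MAX_BINS.  */

struct tcache_model
{
  unsigned short counts[NBINS];    /* Remaining capacity per bin.  */
  void *entries[NBINS];            /* Free-list heads.  */
};

static void
init_with_per_bin_limits (struct tcache_model *tc)
{
  memset (tc, 0, sizeof *tc);
  for (int i = 0; i < NBINS; i++)
    tc->counts[i] = 7;             /* Default limit, like mp_.tcache_count.  */

  /* Hypothetical tuning: cache more of the smallest size class, and leave
     the last bin zero-initialized, i.e. disabled for both free and malloc.  */
  tc->counts[0] = 16;
  tc->counts[NBINS - 1] = 0;
}

int
main (void)
{
  struct tcache_model tc;
  init_with_per_bin_limits (&tc);
  return 0;
}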