[1/6] malloc: Move mmap code out of __libc_free hotpath

Message ID PAWPR08MB8982C6721E02AC0F3C47905B83AD2@PAWPR08MB8982.eurprd08.prod.outlook.com (mailing list archive)
State Accepted
Delegated to: Adhemerval Zanella Netto
Series [1/6] malloc: Move mmap code out of __libc_free hotpath

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

Wilco Dijkstra March 31, 2025, 12:54 p.m. UTC
  Currently __libc_free checks for a freed mmap chunk in the fast path.
Also errno is always saved and restored to preserve it.  Since mmap chunks
are larger than the largest tcache chunk, it is safe to delay this and
handle tcache, smallbin and medium bin blocks first.  Move saving of errno
to cases that actually need it.  Remove a safety check that fails on mmap
chunks and add a check that mmap chunks cannot be added to tcache.

Performance of bench-malloc-thread improves by 9.2% for 1 thread and
6.9% for 32 threads on Neoverse V2.

Passes regress, OK for commit?

---
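
As an aside on the errno handling described above: the guarantee that free() does
not clobber errno can be exercised from user code. The standalone sketch below is
illustrative only (similar in spirit to glibc's existing malloc/tst-free-errno test,
not part of this patch); it assumes default tunables so that a 4 MiB request is
served by mmap and its free goes through munmap.

/* Illustrative sketch, not part of the patch: freeing an mmapped chunk
   must not clobber errno.  */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* 4 MiB is above the default mmap threshold, so with default tunables
     this chunk is typically mmapped and free() calls munmap internally.  */
  void *p = malloc (4 * 1024 * 1024);
  if (p == NULL)
    return 1;
  errno = 1789;                  /* Arbitrary sentinel value.  */
  free (p);
  if (errno != 1789)
    {
      puts ("free clobbered errno");
      return 1;
    }
  puts ("errno preserved across free");
  return 0;
}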
  

Comments

Adhemerval Zanella Netto April 11, 2025, 7:21 p.m. UTC | #1
On 31/03/25 09:54, Wilco Dijkstra wrote:
> 
> Currently __libc_free checks for a freed mmap chunk in the fast path.
> Also errno is always saved and restored to preserve it.  Since mmap chunks
> are larger than the largest tcache chunk, it is safe to delay this and
> handle tcache, smallbin and medium bin blocks first.  Move saving of errno
> to cases that actually need it.  Remove a safety check that fails on mmap
> chunks and a check that mmap chunks cannot be added to tcache. 
> 
> Performance of bench-malloc-thread improves by 9.2% for 1 thread and
> 6.9% for 32 threads on Neoverse V2.
> 
> Passes regress, OK for commit?
> 
> ---
> 
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index a0bc733482532ce34684d0357cb9076b03ac8a52..fc2f751174d88d1029f04958ca1ebf37ca48bd87 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3316,6 +3316,14 @@ tcache_init(void)
>    if (tcache_shutting_down)
>      return;
>  
> +  /* Check minimum mmap chunk is larger than max tcache size.  This means
> +     mmap chunks with their different layout are never added to tcache.  */
> +  if (MAX_TCACHE_SIZE >= GLRO (dl_pagesize) / 2)
> +    malloc_printerr ("max tcache size too large");

Ok. I think we can eventually make it a _Static_assert once we have the pagesize.h
with the max and min supported page size values.
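
For illustration, such an assertion might look like the sketch below; PAGE_SIZE_MIN
is a hypothetical name standing in for the minimum supported page size that the
proposed pagesize.h would provide.

/* Hypothetical sketch only: PAGE_SIZE_MIN does not exist yet and would
   come from the proposed pagesize.h.  Mirrors the runtime check added
   in tcache_init above.  */
_Static_assert (MAX_TCACHE_SIZE < PAGE_SIZE_MIN / 2,
                "max tcache size too large");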

And checking on this code, it makes me wonder if the glibc.malloc.tcache_max tunable
is really worth it. Without it, I think we can optimize tcache_available code
generation, since mp_.tcache_bins could be replaced by a constant.
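
A rough sketch of the code-generation argument (paraphrased, not the actual
malloc.c definition): without the tunable, the runtime bound mp_.tcache_bins could
become the compile-time constant TCACHE_MAX_BINS, which lets the compiler fold the
bin-index check.

/* Illustrative paraphrase only; the real tcache_available reads
   mp_.tcache_bins, which the glibc.malloc.tcache_max tunable can lower
   at startup.  */
static __always_inline bool
tcache_available (size_t tc_idx)
{
  return tc_idx < TCACHE_MAX_BINS   /* Constant instead of mp_.tcache_bins.  */
         && tcache != NULL
         && tcache->counts[tc_idx] > 0;
}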

> +
> +  /* Preserve errno when called from free() - _int_malloc may corrupt it.  */
> +  int err = errno;
> +
>    arena_get (ar_ptr, bytes);
>    victim = _int_malloc (ar_ptr, bytes);
>    if (!victim && ar_ptr != NULL)
> @@ -3324,10 +3332,11 @@ tcache_init(void)
>        victim = _int_malloc (ar_ptr, bytes);
>      }
>  
> -

Spurious line removal.

>    if (ar_ptr != NULL)
>      __libc_lock_unlock (ar_ptr->mutex);
>  
> +  __set_errno (err);
> +
>    /* In a low memory situation, we may not be able to allocate memory
>       - in which case, we just keep trying later.  However, we
>       typically do this very early, so either there is sufficient
> @@ -3453,37 +3462,15 @@ __libc_free (void *mem)
>    if (__glibc_unlikely (mtag_enabled))
>      *(volatile char *)mem;
>  
> -  int err = errno;
> -
>    p = mem2chunk (mem);
>  
> -  if (chunk_is_mmapped (p))                       /* release mmapped memory. */
> -    {
> -      /* See if the dynamic brk/mmap threshold needs adjusting.
> -	 Dumped fake mmapped chunks do not affect the threshold.  */
> -      if (!mp_.no_dyn_threshold
> -          && chunksize_nomask (p) > mp_.mmap_threshold
> -          && chunksize_nomask (p) <= DEFAULT_MMAP_THRESHOLD_MAX)
> -        {
> -          mp_.mmap_threshold = chunksize (p);
> -          mp_.trim_threshold = 2 * mp_.mmap_threshold;
> -          LIBC_PROBE (memory_mallopt_free_dyn_thresholds, 2,
> -                      mp_.mmap_threshold, mp_.trim_threshold);
> -        }
> -      munmap_chunk (p);
> -    }
> -  else
> -    {
> -      MAYBE_INIT_TCACHE ();
> -
> -      /* Mark the chunk as belonging to the library again.  */
> -      (void)tag_region (chunk2mem (p), memsize (p));
> +  MAYBE_INIT_TCACHE ();
>  
> -      ar_ptr = arena_for_chunk (p);
> -      _int_free (ar_ptr, p, 0);
> -    }
> +  /* Mark the chunk as belonging to the library again.  */
> +  (void)tag_region (chunk2mem (p), memsize (p));

I don't think we strictly need the return cast.

>  
> -  __set_errno (err);
> +  ar_ptr = arena_for_chunk (p);
> +  _int_free (ar_ptr, p, 0);
>  }
>  libc_hidden_def (__libc_free)
>  
> @@ -4570,9 +4557,8 @@ _int_free_check (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
>    if (__builtin_expect ((uintptr_t) p > (uintptr_t) -size, 0)
>        || __builtin_expect (misaligned_chunk (p), 0))
>      malloc_printerr ("free(): invalid pointer");
> -  /* We know that each chunk is at least MINSIZE bytes in size or a
> -     multiple of MALLOC_ALIGNMENT.  */
> -  if (__glibc_unlikely (size < MINSIZE || !aligned_OK (size)))
> +  /* We know that each chunk is at least MINSIZE bytes.  */
> +  if (__glibc_unlikely (size < MINSIZE))
>      malloc_printerr ("free(): invalid size");
>  
>    check_inuse_chunk (av, p);
> @@ -4669,6 +4655,9 @@ _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
>  
>    else if (!chunk_is_mmapped(p)) {
>  
> +    /* Preserve errno in case block merging results in munmap.  */
> +    int err = errno;
> +
>      /* If we're single-threaded, don't lock the arena.  */
>      if (SINGLE_THREAD_P)
>        have_lock = true;
> @@ -4680,13 +4669,33 @@ _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
>  
>      if (!have_lock)
>        __libc_lock_unlock (av->mutex);
> +
> +    __set_errno (err);
>    }
>    /*
>      If the chunk was allocated via mmap, release via munmap().
>    */
>  
>    else {
> +
> +    /* Preserve errno in case munmap sets it.  */
> +    int err = errno;
> +
> +    /* See if the dynamic brk/mmap threshold needs adjusting.
> +       Dumped fake mmapped chunks do not affect the threshold.  */
> +    if (!mp_.no_dyn_threshold
> +        && chunksize_nomask (p) > mp_.mmap_threshold
> +        && chunksize_nomask (p) <= DEFAULT_MMAP_THRESHOLD_MAX)
> +      {
> +        mp_.mmap_threshold = chunksize (p);
> +        mp_.trim_threshold = 2 * mp_.mmap_threshold;
> +        LIBC_PROBE (memory_mallopt_free_dyn_thresholds, 2,
> +		    mp_.mmap_threshold, mp_.trim_threshold);
> +      }
> +
>      munmap_chunk (p);
> +
> +    __set_errno (err);
>    }
>  }
>  

The rest looks ok. I think DJ also gave this an RB.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
  
Wilco Dijkstra April 15, 2025, 11:42 a.m. UTC | #2
Hi Adhemerval,

> +  if (MAX_TCACHE_SIZE >= GLRO (dl_pagesize) / 2)
> +    malloc_printerr ("max tcache size too large");

> Ok. I think we can eventually make it an _Static_assert once we have the pagesize.h
> with the max and min support page sizes values.

Yes, it could be. However, for simplicity we should fix the slightly different size
calculation for mmap'd chunks as well.

> And checking on this code, it makes me wonder if glibc.malloc.tcache_max tunable
> is really worth. Without it, I think we can optimize tcache_available code
> generation since the mp_.tcache_bins could be replaced by a constant.

That is what I am doing after changing the way tcache counters work. That allows
individual bins to be turned off, so there is no longer any need for global variables.

> @@ -3324,10 +3332,11 @@ tcache_init(void)
>        victim = _int_malloc (ar_ptr, bytes);
>      }

> -

> Spurious line removal.

Fixed.

> +  /* Mark the chunk as belonging to the library again.  */
> +  (void)tag_region (chunk2mem (p), memsize (p));

> I don't think we strictly need the return cast.

Indeed, I've removed it in the commit.

Cheers,
Wilco
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index a0bc733482532ce34684d0357cb9076b03ac8a52..fc2f751174d88d1029f04958ca1ebf37ca48bd87 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3316,6 +3316,14 @@  tcache_init(void)
   if (tcache_shutting_down)
     return;
 
+  /* Check minimum mmap chunk is larger than max tcache size.  This means
+     mmap chunks with their different layout are never added to tcache.  */
+  if (MAX_TCACHE_SIZE >= GLRO (dl_pagesize) / 2)
+    malloc_printerr ("max tcache size too large");
+
+  /* Preserve errno when called from free() - _int_malloc may corrupt it.  */
+  int err = errno;
+
   arena_get (ar_ptr, bytes);
   victim = _int_malloc (ar_ptr, bytes);
   if (!victim && ar_ptr != NULL)
@@ -3324,10 +3332,11 @@  tcache_init(void)
       victim = _int_malloc (ar_ptr, bytes);
     }
 
-
   if (ar_ptr != NULL)
     __libc_lock_unlock (ar_ptr->mutex);
 
+  __set_errno (err);
+
   /* In a low memory situation, we may not be able to allocate memory
      - in which case, we just keep trying later.  However, we
      typically do this very early, so either there is sufficient
@@ -3453,37 +3462,15 @@  __libc_free (void *mem)
   if (__glibc_unlikely (mtag_enabled))
     *(volatile char *)mem;
 
-  int err = errno;
-
   p = mem2chunk (mem);
 
-  if (chunk_is_mmapped (p))                       /* release mmapped memory. */
-    {
-      /* See if the dynamic brk/mmap threshold needs adjusting.
-	 Dumped fake mmapped chunks do not affect the threshold.  */
-      if (!mp_.no_dyn_threshold
-          && chunksize_nomask (p) > mp_.mmap_threshold
-          && chunksize_nomask (p) <= DEFAULT_MMAP_THRESHOLD_MAX)
-        {
-          mp_.mmap_threshold = chunksize (p);
-          mp_.trim_threshold = 2 * mp_.mmap_threshold;
-          LIBC_PROBE (memory_mallopt_free_dyn_thresholds, 2,
-                      mp_.mmap_threshold, mp_.trim_threshold);
-        }
-      munmap_chunk (p);
-    }
-  else
-    {
-      MAYBE_INIT_TCACHE ();
-
-      /* Mark the chunk as belonging to the library again.  */
-      (void)tag_region (chunk2mem (p), memsize (p));
+  MAYBE_INIT_TCACHE ();
 
-      ar_ptr = arena_for_chunk (p);
-      _int_free (ar_ptr, p, 0);
-    }
+  /* Mark the chunk as belonging to the library again.  */
+  (void)tag_region (chunk2mem (p), memsize (p));
 
-  __set_errno (err);
+  ar_ptr = arena_for_chunk (p);
+  _int_free (ar_ptr, p, 0);
 }
 libc_hidden_def (__libc_free)
 
@@ -4570,9 +4557,8 @@  _int_free_check (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
   if (__builtin_expect ((uintptr_t) p > (uintptr_t) -size, 0)
       || __builtin_expect (misaligned_chunk (p), 0))
     malloc_printerr ("free(): invalid pointer");
-  /* We know that each chunk is at least MINSIZE bytes in size or a
-     multiple of MALLOC_ALIGNMENT.  */
-  if (__glibc_unlikely (size < MINSIZE || !aligned_OK (size)))
+  /* We know that each chunk is at least MINSIZE bytes.  */
+  if (__glibc_unlikely (size < MINSIZE))
     malloc_printerr ("free(): invalid size");
 
   check_inuse_chunk (av, p);
@@ -4669,6 +4655,9 @@  _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
 
   else if (!chunk_is_mmapped(p)) {
 
+    /* Preserve errno in case block merging results in munmap.  */
+    int err = errno;
+
     /* If we're single-threaded, don't lock the arena.  */
     if (SINGLE_THREAD_P)
       have_lock = true;
@@ -4680,13 +4669,33 @@  _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
 
     if (!have_lock)
       __libc_lock_unlock (av->mutex);
+
+    __set_errno (err);
   }
   /*
     If the chunk was allocated via mmap, release via munmap().
   */
 
   else {
+
+    /* Preserve errno in case munmap sets it.  */
+    int err = errno;
+
+    /* See if the dynamic brk/mmap threshold needs adjusting.
+       Dumped fake mmapped chunks do not affect the threshold.  */
+    if (!mp_.no_dyn_threshold
+        && chunksize_nomask (p) > mp_.mmap_threshold
+        && chunksize_nomask (p) <= DEFAULT_MMAP_THRESHOLD_MAX)
+      {
+        mp_.mmap_threshold = chunksize (p);
+        mp_.trim_threshold = 2 * mp_.mmap_threshold;
+        LIBC_PROBE (memory_mallopt_free_dyn_thresholds, 2,
+		    mp_.mmap_threshold, mp_.trim_threshold);
+      }
+
     munmap_chunk (p);
+
+    __set_errno (err);
   }
 }