malloc: Simplify _int_free_chunk

Message ID PAWPR08MB8982F5F029936337C0F20A778350A@PAWPR08MB8982.eurprd08.prod.outlook.com (mailing list archive)
State New
Headers
Series malloc: Simplify _int_free_chunk |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
redhat-pt-bot/TryBot-32bit success Build for i686

Commit Message

Wilco Dijkstra April 1, 2026, 8:12 p.m. UTC
  Simplify _int_free_chunk() so that it always locks the arena when needed.  Use
_int_free_merge_chunk() instead for the cases that assume the arena has already
been locked.  Move the errno save/restore into _int_free_maybe_trim().

Passes regress, OK for commit?

---
  

Comments

Rocket Ma April 2, 2026, 6:54 a.m. UTC | #1
>@@ -3472,7 +3472,7 @@ __libc_realloc (void *oldmem, size_t bytes)
> 	  size_t sz = memsize (oldp);
> 	  memcpy (newp, oldmem, sz);
> 	  (void) tag_region (chunk2mem (oldp), sz);
>-          _int_free_chunk (ar_ptr, oldp, chunksize (oldp), 0);
>+          _int_free_chunk (ar_ptr, oldp, chunksize (oldp));
>         }
>     }
> 
>@@ -4549,7 +4545,7 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
> 	      (void) tag_region (oldmem, sz);
> 	      newmem = tag_new_usable (newmem);
> 	      memcpy (newmem, oldmem, sz);
>-	      _int_free_chunk (av, oldp, chunksize (oldp), 1);
>+	      _int_free_merge_chunk (av, oldp, chunksize (oldp));
> 	      check_inuse_chunk (av, newp);
> 	      return newmem;
>             }

Could you add a workaround to realloc so that user-requested chunks are
put into the tcache? For remainders that's OK, since the user never
requested them. But in some expanding scenarios the old chunk was
requested by the user, so putting it into the tcache may improve
performance compared to putting it in the bins.
  
Wilco Dijkstra April 2, 2026, 12:20 p.m. UTC | #2
Hi,
 
>@@ -3472,7 +3472,7 @@ __libc_realloc (void *oldmem, size_t bytes)
>          size_t sz = memsize (oldp);
>          memcpy (newp, oldmem, sz);
>          (void) tag_region (chunk2mem (oldp), sz);
>-          _int_free_chunk (ar_ptr, oldp, chunksize (oldp), 0);
>+          _int_free_chunk (ar_ptr, oldp, chunksize (oldp));
>         }

> Could you add a workaround to realloc so that user-requested chunks are
> put into the tcache? For remainders that's OK, since the user never
> requested them. But in some expanding scenarios the old chunk was
> requested by the user, so putting it into the tcache may improve
> performance compared to putting it in the bins.

It's possible - but note realloc() doesn't use tcache on allocation (and hasn't
historically). In general it seems like a bad idea to keep the arena lock while
doing memcpy (or tagging) of large blocks since that can block other threads.
If we moved the locking into _int_realloc(), we could unlock if we can't reuse
the existing/next block, and then use __libc_malloc/__libc_free so that it can
use tcache in both cases. We could even avoid locking by doing a speculative
check.

Do you see any performance issues with realloc()? There are lots of things
that could be improved - for example it should not split off tiny blocks when
shrinking, and similarly when growing, it should force a minimum percentage
of growth.

Cheers,
Wilco
  
Rocket Ma April 2, 2026, 5:03 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> 于2026年4月2日周四 20:21写道:

> It's possible - but note realloc() doesn't use tcache on allocation (and hasn't
> historically). In general it seems like a bad idea to keep the arena lock while
> doing memcpy (or tagging) of large blocks since that can block other threads.
> If we moved the locking into _int_realloc(), we could unlock if we can't reuse
> the existing/next block, and then use __libc_malloc/__libc_free so that it can
> use tcache in both cases. We could even avoid locking by doing a speculative
> check.
>
> Do you see any performance issues with realloc()? There are lots of things
> that could be improved - for example it should not split off tiny blocks when
> shrinking, and similarly when growing, it should force a minimum percentage
> of growth.

Oh, so that's the reason? Actually I hadn't noticed that realloc doesn't
use the tcache, and I didn't expect that memcpy could lead to holding the
lock. Sorry for my ignorance.

I haven't run into performance issues; I was just wondering why realloc
doesn't use the tcache. Thanks for answering.

Cheers,
Rocket
  

Patch

diff --git a/malloc/malloc-check.c b/malloc/malloc-check.c
index 49b623df12ffa7afe376b73b2ad222d357dc6612..d85986b23d1bb0beb5ccc29d380ca96daf8ef41a 100644
--- a/malloc/malloc-check.c
+++ b/malloc/malloc-check.c
@@ -233,10 +233,10 @@  free_check (void *mem)
     }
   else
     {
+      __libc_lock_unlock (main_arena.mutex);
       /* Mark the chunk as belonging to the library again.  */
       (void)tag_region (chunk2mem (p), memsize (p));
-      _int_free_chunk (&main_arena, p, chunksize (p), 1);
-      __libc_lock_unlock (main_arena.mutex);
+      _int_free_chunk (&main_arena, p, chunksize (p));
     }
   __set_errno (err);
 }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 6a888b0eb7de53ae7b814275e86d2bd2f06b5e53..af2a902320b2f671524ba826fe828ee65bfa9cda 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1024,7 +1024,7 @@  typedef struct malloc_chunk* mchunkptr;
 /* Internal routines.  */
 
 static void*  _int_malloc(mstate, size_t);
-static void _int_free_chunk (mstate, mchunkptr, INTERNAL_SIZE_T, int);
+static void _int_free_chunk (mstate, mchunkptr, INTERNAL_SIZE_T);
 static void _int_free_merge_chunk (mstate, mchunkptr, INTERNAL_SIZE_T);
 static INTERNAL_SIZE_T _int_free_create_chunk (mstate,
 					       mchunkptr, INTERNAL_SIZE_T,
@@ -2403,7 +2403,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
 			CHUNK_HDR_SZ | PREV_INUSE);
               set_foot (chunk_at_offset (old_top, old_size), CHUNK_HDR_SZ);
               set_head (old_top, old_size | PREV_INUSE | NON_MAIN_ARENA);
-              _int_free_chunk (av, old_top, chunksize (old_top), 1);
+              _int_free_merge_chunk (av, old_top, chunksize (old_top));
             }
           else
             {
@@ -2674,7 +2674,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
                       /* If possible, release the rest. */
                       if (old_size >= MINSIZE)
                         {
-                          _int_free_chunk (av, old_top, chunksize (old_top), 1);
+                          _int_free_merge_chunk (av, old_top, chunksize (old_top));
                         }
                     }
                 }
@@ -3171,12 +3171,12 @@  tcache_thread_shutdown (void)
 	  tcache_tmp->entries[i] = REVEAL_PTR (e->next);
 	  e->key = 0;
 	  p = mem2chunk (e);
-	  _int_free_chunk (arena_for_chunk (p), p, chunksize (p), 0);
+	  _int_free_chunk (arena_for_chunk (p), p, chunksize (p));
 	}
     }
 
   p = mem2chunk (tcache_tmp);
-  _int_free_chunk (arena_for_chunk (p), p, chunksize (p), 0);
+  _int_free_chunk (arena_for_chunk (p), p, chunksize (p));
 }
 
 /* Initialize tcache.  In the rare case there isn't any memory available,
@@ -3350,7 +3350,7 @@  __libc_free (void *mem)
 					  size - MINSIZE)))
     return malloc_printerr_tail ("free(): invalid size");
 
-  _int_free_chunk (arena_for_chunk (p), p, size, 0);
+  _int_free_chunk (arena_for_chunk (p), p, size);
 }
 libc_hidden_def (__libc_free)
 
@@ -3472,7 +3472,7 @@  __libc_realloc (void *oldmem, size_t bytes)
 	  size_t sz = memsize (oldp);
 	  memcpy (newp, oldmem, sz);
 	  (void) tag_region (chunk2mem (oldp), sz);
-          _int_free_chunk (ar_ptr, oldp, chunksize (oldp), 0);
+          _int_free_chunk (ar_ptr, oldp, chunksize (oldp));
         }
     }
 
@@ -4253,11 +4253,10 @@  _int_malloc (mstate av, size_t bytes)
    ------------------------------ free ------------------------------
  */
 
-/* Free chunk P of SIZE bytes to the arena.  HAVE_LOCK indicates where
-   the arena for P has already been locked.  Caller must ensure chunk
-   and size are valid.  */
+/* Free chunk P of SIZE bytes to the arena AV (which is not locked).
+   Caller must ensure chunk and size are valid.  */
 static void
-_int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
+_int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
 {
   /*
     Consolidate other non-mmapped chunks as they arrive.
@@ -4265,22 +4264,14 @@  _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
 
   if (!chunk_is_mmapped(p)) {
 
-    /* Preserve errno in case block merging results in munmap.  */
-    int err = errno;
-
-    /* If we're single-threaded, don't lock the arena.  */
     if (SINGLE_THREAD_P)
-      have_lock = true;
-
-    if (!have_lock)
-      __libc_lock_lock (av->mutex);
-
-    _int_free_merge_chunk (av, p, size);
-
-    if (!have_lock)
-      __libc_lock_unlock (av->mutex);
-
-    __set_errno (err);
+      _int_free_merge_chunk (av, p, size);
+    else
+      {
+	__libc_lock_lock (av->mutex);
+	_int_free_merge_chunk (av, p, size);
+	__libc_lock_unlock (av->mutex);
+      }
   }
   /*
     If the chunk was allocated via mmap, release via munmap().
@@ -4309,8 +4300,8 @@  _int_free_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, int have_lock)
   }
 }
 
-/* Try to merge chunk P of SIZE bytes with its neighbors.  Put the
-   resulting chunk on the appropriate bin list.  P must not be on a
+/* Try to merge chunk P of SIZE bytes from locked arena AV with its neighbors.
+   Put the resulting chunk on the appropriate bin list.  P must not be on a
    bin list yet, and it can be in use.  */
 static void
 _int_free_merge_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
@@ -4440,6 +4431,9 @@  _int_free_maybe_trim (mstate av, INTERNAL_SIZE_T size)
      if ATTEMPT_TRIMMING_THRESHOLD is reached.  */
   if (size >= ATTEMPT_TRIMMING_THRESHOLD)
     {
+      /* Preserve errno.  */
+      int err = errno;
+
       if (av == &main_arena)
 	{
 #ifndef MORECORE_CANNOT_TRIM
@@ -4456,6 +4450,8 @@  _int_free_maybe_trim (mstate av, INTERNAL_SIZE_T size)
 	  assert (heap->ar_ptr == av);
 	  heap_trim (heap, mp_.top_pad);
 	}
+
+      __set_errno (err);
     }
 }
 
@@ -4549,7 +4545,7 @@  _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
 	      (void) tag_region (oldmem, sz);
 	      newmem = tag_new_usable (newmem);
 	      memcpy (newmem, oldmem, sz);
-	      _int_free_chunk (av, oldp, chunksize (oldp), 1);
+	      _int_free_merge_chunk (av, oldp, chunksize (oldp));
 	      check_inuse_chunk (av, newp);
 	      return newmem;
             }
@@ -4577,7 +4573,7 @@  _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
                 (av != &main_arena ? NON_MAIN_ARENA : 0));
       /* Mark remainder as inuse so free() won't complain */
       set_inuse_bit_at_offset (remainder, remainder_size);
-      _int_free_chunk (av, remainder, chunksize (remainder), 1);
+      _int_free_merge_chunk (av, remainder, chunksize (remainder));
     }
 
   check_inuse_chunk (av, newp);