[v4,3/3] malloc: Add tcache path for calloc

Message ID 20241126073340.3724382-4-wangyang.guo@intel.com
State New
Headers
Series malloc: TCACHE improvement for free and calloc |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

Wangyang Guo Nov. 26, 2024, 7:33 a.m. UTC
  This commit adds tcache support in calloc(), which can largely improve
the performance of small-size allocations, especially in multi-threaded
scenarios. clear_mem() and tcache_available() are split out as helper
functions to make the code easier to reuse.

Also fix the tst-safe-linking failure after enabling tcache. Previously,
calloc() was used as a way to bypass tcache in memory allocation and
trigger the safe-linking check in the fastbins path. With tcache enabled,
extra workarounds are needed to bypass tcache.

Result of bench-malloc-thread benchmark

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.724
4 threads  | 0.534

---
Changes in v3:
- Split out tcache_available() as helper function.
- Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
Changes in v2:
- Merge tst-safe-linking fix to make sure CI check pass.
- Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
---
 malloc/malloc.c           | 129 ++++++++++++++++++++++++--------------
 malloc/tst-safe-linking.c |  81 ++++++++++++++++++++----
 2 files changed, 150 insertions(+), 60 deletions(-)
  

Comments

H.J. Lu Nov. 26, 2024, 9:08 a.m. UTC | #1
On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> This commit add tcache support in calloc() which can largely improve
> the performance of small size allocation, especially in multi-thread
> scenario. clear_mem() and tcache_available() is split out as a helper
> function for better reusing the code.
>
> Also fix tst-safe-linking failure after enabling tcache. In previous,
> calloc() is used as a way to by-pass tcache in memory allocation and
> trigger safe-linking check in fastbins path. With tcache enabled, it
> needs extra workarounds to bypass tcache.
>
> Result of bench-malloc-thread benchmark
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads#   | Ratio
> -----------|------
> 1 thread   | 0.724
> 4 threads  | 0.534
>
> ---
> Changes in v3:
> - Split out tcache_available() as helper function.
> - Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
> Changes in v2:
> - Merge tst-safe-linking fix to make sure CI check pass.
> - Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
> ---
>  malloc/malloc.c           | 129 ++++++++++++++++++++++++--------------
>  malloc/tst-safe-linking.c |  81 ++++++++++++++++++++----
>  2 files changed, 150 insertions(+), 60 deletions(-)
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 81ddd2c3a8..1437ec20fb 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3208,6 +3208,18 @@ tcache_next (tcache_entry *e)
>    return (tcache_entry *) REVEAL_PTR (e->next);
>  }
>
> +/* Check if tcache is available for alloc by corresponding tc_idx.  */
> +static __always_inline bool
> +tcache_availabe (size_t tc_idx)
> +{
> +  if (tc_idx < mp_.tcache_bins
> +      && tcache != NULL
> +      && tcache->counts[tc_idx] > 0)
> +    return true;
> +  else
> +    return false;
> +}
> +
>  /* Verify if the suspicious tcache_entry is double free.
>     It's not expected to execute very often, mark it as noinline.  */
>  static __attribute__ ((noinline)) void
> @@ -3366,9 +3378,7 @@ __libc_malloc (size_t bytes)
>    MAYBE_INIT_TCACHE ();
>
>    DIAG_PUSH_NEEDS_COMMENT;
> -  if (tc_idx < mp_.tcache_bins
> -      && tcache != NULL
> -      && tcache->counts[tc_idx] > 0)
> +  if (tcache_availabe (tc_idx))
>      {
>        victim = tcache_get (tc_idx);
>        return tag_new_usable (victim);
> @@ -3667,9 +3677,7 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        }
>      size_t tc_idx = csize2tidx (tbytes);
>
> -    if (tc_idx < mp_.tcache_bins
> -       && tcache != NULL
> -       && tcache->counts[tc_idx] > 0)
> +    if (tcache_availabe (tc_idx))
>        {
>         /* The tcache itself isn't encoded, but the chain is.  */
>         tcache_entry **tep = & tcache->entries[tc_idx];
> @@ -3747,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
>    return _mid_memalign (pagesize, rounded_bytes, address);
>  }
>
> +static __always_inline void *
> +clear_mem (void *mem, INTERNAL_SIZE_T csz)
> +{
> +  INTERNAL_SIZE_T *d;
> +  unsigned long clearsize, nclears;
> +
> +  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> +     contents have an odd number of INTERNAL_SIZE_T-sized words;
> +     minimally 3.  */
> +  d = (INTERNAL_SIZE_T *) mem;
> +  clearsize = csz - SIZE_SZ;
> +  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +  assert (nclears >= 3);
> +
> +  if (nclears > 9)
> +    return memset (d, 0, clearsize);
> +
> +  else
> +    {
> +      *(d + 0) = 0;
> +      *(d + 1) = 0;
> +      *(d + 2) = 0;
> +      if (nclears > 4)
> +        {
> +          *(d + 3) = 0;
> +          *(d + 4) = 0;
> +          if (nclears > 6)
> +            {
> +              *(d + 5) = 0;
> +              *(d + 6) = 0;
> +              if (nclears > 8)
> +                {
> +                  *(d + 7) = 0;
> +                  *(d + 8) = 0;
> +                }
> +            }
> +        }
> +    }
> +
> +  return mem;
> +}
> +
>  void *
>  __libc_calloc (size_t n, size_t elem_size)
>  {
>    mstate av;
> -  mchunkptr oldtop;
> -  INTERNAL_SIZE_T sz, oldtopsize;
> +  mchunkptr oldtop, p;
> +  INTERNAL_SIZE_T sz, oldtopsize, csz;
>    void *mem;
> -  unsigned long clearsize;
> -  unsigned long nclears;
> -  INTERNAL_SIZE_T *d;
>    ptrdiff_t bytes;
>
>    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3772,6 +3819,27 @@ __libc_calloc (size_t n, size_t elem_size)
>
>    MAYBE_INIT_TCACHE ();
>
> +#if USE_TCACHE
> +  /* int_free also calls request2size, be careful to not pad twice.  */
> +  size_t tbytes = checked_request2size (bytes);
> +  if (tbytes == 0)
> +    {
> +      __set_errno (ENOMEM);
> +      return NULL;
> +    }
> +  size_t tc_idx = csize2tidx (tbytes);
> +
> +  if (tcache_availabe (tc_idx))
> +    {
> +      mem = tcache_get (tc_idx);
> +      p = mem2chunk (mem);
> +      if (__glibc_unlikely (mtag_enabled))
> +       return tag_new_zero_region (mem, memsize (p));
> +      csz = chunksize (p);
> +      return clear_mem (mem, csz);
> +    }
> +#endif
> +
>    if (SINGLE_THREAD_P)
>      av = &main_arena;
>    else
> @@ -3826,7 +3894,7 @@ __libc_calloc (size_t n, size_t elem_size)
>    if (mem == NULL)
>      return NULL;
>
> -  mchunkptr p = mem2chunk (mem);
> +  p = mem2chunk (mem);
>
>    /* If we are using memory tagging, then we need to set the tags
>       regardless of MORECORE_CLEARS, so we zero the whole block while
> @@ -3834,7 +3902,7 @@ __libc_calloc (size_t n, size_t elem_size)
>    if (__glibc_unlikely (mtag_enabled))
>      return tag_new_zero_region (mem, memsize (p));
>
> -  INTERNAL_SIZE_T csz = chunksize (p);
> +  csz = chunksize (p);
>
>    /* Two optional cases in which clearing not necessary */
>    if (chunk_is_mmapped (p))
> @@ -3853,40 +3921,7 @@ __libc_calloc (size_t n, size_t elem_size)
>      }
>  #endif
>
> -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> -     minimally 3.  */
> -  d = (INTERNAL_SIZE_T *) mem;
> -  clearsize = csz - SIZE_SZ;
> -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> -  assert (nclears >= 3);
> -
> -  if (nclears > 9)
> -    return memset (d, 0, clearsize);
> -
> -  else
> -    {
> -      *(d + 0) = 0;
> -      *(d + 1) = 0;
> -      *(d + 2) = 0;
> -      if (nclears > 4)
> -        {
> -          *(d + 3) = 0;
> -          *(d + 4) = 0;
> -          if (nclears > 6)
> -            {
> -              *(d + 5) = 0;
> -              *(d + 6) = 0;
> -              if (nclears > 8)
> -                {
> -                  *(d + 7) = 0;
> -                  *(d + 8) = 0;
> -                }
> -            }
> -        }
> -    }
> -
> -  return mem;
> +  return clear_mem (mem, csz);
>  }
>  #endif /* IS_IN (libc) */
>
> diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
> index 01dd07004d..5302575ad1 100644
> --- a/malloc/tst-safe-linking.c
> +++ b/malloc/tst-safe-linking.c
> @@ -111,22 +111,37 @@ test_fastbin (void *closure)
>    int i;
>    int mask = ((int *)closure)[0];
>    size_t size = TCACHE_ALLOC_SIZE;
> +  void * ps[TCACHE_FILL_COUNT];
> +  void * pps[TCACHE_FILL_COUNT];
>
>    printf ("++ fastbin ++\n");
>
> +  /* Populate the fastbin list.  */
> +  void * volatile a = calloc (1, size);
> +  void * volatile b = calloc (1, size);
> +  void * volatile c = calloc (1, size);
> +  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> +  /* Chunks for later tcache filling from fastbins.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      pps[i] = p;
> +    }
> +
>    /* Take the tcache out of the game.  */
>    for (i = 0; i < TCACHE_FILL_COUNT; ++i)
>      {
>        void * volatile p = calloc (1, size);
> -      printf ("p=%p\n", p);
> -      free (p);
> +      ps[i] = p;
>      }
>
> -  /* Populate the fastbin list.  */
> -  void * volatile a = calloc (1, size);
> -  void * volatile b = calloc (1, size);
> -  void * volatile c = calloc (1, size);
> -  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +    }
> +
> +  /* Free abc will return to fastbin in FIFO order.  */
>    free (a);
>    free (b);
>    free (c);
> @@ -136,11 +151,43 @@ test_fastbin (void *closure)
>    memset (c, mask & 0xFF, size);
>    printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
>
> +  /* Filling fastbins, will be copied to tcache later.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (pps[i]);
> +    }
> +
> +  /* Drain out tcache to make sure later alloc from fastbins.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      ps[i] = p;
> +    }
> +
> +  /* This line will also filling tcache with remain pps and c.  */
> +  pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
> +
> +  /* Tcache is FILO, now the first one is c, take it out.  */
>    c = calloc (1, size);
>    printf ("Allocated: c=%p\n", c);
> +
> +  /* Drain out remain pps from tcache.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      pps[i] = p;
> +    }
> +
>    /* This line will trigger the Safe-Linking check.  */
>    b = calloc (1, size);
>    printf ("b=%p\n", b);
> +
> +  /* Free previous pointers. */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +      free (pps[i]);
> +    }
>  }
>
>  /* Try corrupting the fastbin list and trigger a consolidate.  */
> @@ -150,21 +197,29 @@ test_fastbin_consolidate (void *closure)
>    int i;
>    int mask = ((int*)closure)[0];
>    size_t size = TCACHE_ALLOC_SIZE;
> +  void * ps[TCACHE_FILL_COUNT];
>
>    printf ("++ fastbin consolidate ++\n");
>
> +  /* Populate the fastbin list.  */
> +  void * volatile a = calloc (1, size);
> +  void * volatile b = calloc (1, size);
> +  void * volatile c = calloc (1, size);
> +  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
>    /* Take the tcache out of the game.  */
>    for (i = 0; i < TCACHE_FILL_COUNT; ++i)
>      {
>        void * volatile p = calloc (1, size);
> -      free (p);
> +      ps[i] = p;
>      }
>
> -  /* Populate the fastbin list.  */
> -  void * volatile a = calloc (1, size);
> -  void * volatile b = calloc (1, size);
> -  void * volatile c = calloc (1, size);
> -  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +    }
> +
> +  /* Free abc will return to fastbin.  */
>    free (a);
>    free (b);
>    free (c);
> --
> 2.43.5
>

Since you are working on calloc, please try this patch to see if
it improves performance on x86-64.

Thanks.
  
Wangyang Guo Nov. 26, 2024, 9:39 a.m. UTC | #2
On 11/26/2024 5:08 PM, H.J. Lu wrote:

> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>> This commit add tcache support in calloc() which can largely improve
>> the performance of small size allocation, especially in multi-thread
>> scenario. clear_mem() and tcache_available() is split out as a helper
>> function for better reusing the code.
>>
>> Also fix tst-safe-linking failure after enabling tcache. In previous,
>> calloc() is used as a way to by-pass tcache in memory allocation and
>> trigger safe-linking check in fastbins path. With tcache enabled, it
>> needs extra workarounds to bypass tcache.
>>
>> Result of bench-malloc-thread benchmark
>>
>> Test Platform: Xeon-8380
>> Bench Function: calloc
>> Ratio: New / Original time_per_iteration (Lower is Better)
>>
>> Threads#   | Ratio
>> -----------|------
>> 1 thread   | 0.724
>> 4 threads  | 0.534
>>
> Since you are working on calloc, please try this patch to see if
> it improves performance on x86-64.
>
> Thanks.

It looks like the change is within run-to-run variation. For the
bench-malloc-thread benchmark, the cycles spent in this area are not very hot.

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.993
4 threads  | 0.996
  
H.J. Lu Nov. 26, 2024, 9:58 p.m. UTC | #3
On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>
> On 11/26/2024 5:08 PM, H.J. Lu wrote:
>
> > On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
> >> This commit add tcache support in calloc() which can largely improve
> >> the performance of small size allocation, especially in multi-thread
> >> scenario. clear_mem() and tcache_available() is split out as a helper
> >> function for better reusing the code.
> >>
> >> Also fix tst-safe-linking failure after enabling tcache. In previous,
> >> calloc() is used as a way to by-pass tcache in memory allocation and
> >> trigger safe-linking check in fastbins path. With tcache enabled, it
> >> needs extra workarounds to bypass tcache.
> >>
> >> Result of bench-malloc-thread benchmark
> >>
> >> Test Platform: Xeon-8380
> >> Bench Function: calloc
> >> Ratio: New / Original time_per_iteration (Lower is Better)
> >>
> >> Threads#   | Ratio
> >> -----------|------
> >> 1 thread   | 0.724
> >> 4 threads  | 0.534
> >>
> > Since you are working on calloc, please try this patch to see if
> > it improves performance on x86-64.
> >
> > Thanks.
>
> Look like the change is within variation. For bench-malloc-thread
> benchmark, the cycles spent in this area is not very hot.
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads#   | Ratio
> -----------|------
> 1 thread   | 0.993
> 4 threads  | 0.996
>

This patch reduces the number of branches from 3 to 1.  How
does it perform?
  
Wangyang Guo Nov. 27, 2024, 12:51 a.m. UTC | #4
On 11/27/2024 5:58 AM, H.J. Lu wrote:

> On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>> On 11/26/2024 5:08 PM, H.J. Lu wrote:
>>
>>> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>>>> This commit add tcache support in calloc() which can largely improve
>>>> the performance of small size allocation, especially in multi-thread
>>>> scenario. clear_mem() and tcache_available() is split out as a helper
>>>> function for better reusing the code.
>>>>
>>>> Also fix tst-safe-linking failure after enabling tcache. In previous,
>>>> calloc() is used as a way to by-pass tcache in memory allocation and
>>>> trigger safe-linking check in fastbins path. With tcache enabled, it
>>>> needs extra workarounds to bypass tcache.
>>>>
>>>> Result of bench-malloc-thread benchmark
>>>>
>>>> Test Platform: Xeon-8380
>>>> Bench Function: calloc
>>>> Ratio: New / Original time_per_iteration (Lower is Better)
>>>>
>>>> Threads#   | Ratio
>>>> -----------|------
>>>> 1 thread   | 0.724
>>>> 4 threads  | 0.534
>>>>
>>> Since you are working on calloc, please try this patch to see if
>>> it improves performance on x86-64.
>>>
>>> Thanks.
>> Look like the change is within variation. For bench-malloc-thread
>> benchmark, the cycles spent in this area is not very hot.
>>
>> Test Platform: Xeon-8380
>> Bench Function: calloc
>> Ratio: New / Original time_per_iteration (Lower is Better)
>>
>> Threads#   | Ratio
>> -----------|------
>> 1 thread   | 0.993
>> 4 threads  | 0.996
>>
> This patch reduces the number of branches from 3 to 1.  How
> does it perform?

The patch can have significant performance gain in bench-malloc-thread:

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.953
4 threads  | 0.952
  
H.J. Lu Nov. 27, 2024, 1:17 a.m. UTC | #5
On Wed, Nov 27, 2024, 8:51 AM Guo, Wangyang <wangyang.guo@intel.com> wrote:

> On 11/27/2024 5:58 AM, H.J. Lu wrote:
>
> > On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com>
> wrote:
> >> On 11/26/2024 5:08 PM, H.J. Lu wrote:
> >>
> >>> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com>
> wrote:
> >>>> This commit add tcache support in calloc() which can largely improve
> >>>> the performance of small size allocation, especially in multi-thread
> >>>> scenario. clear_mem() and tcache_available() is split out as a helper
> >>>> function for better reusing the code.
> >>>>
> >>>> Also fix tst-safe-linking failure after enabling tcache. In previous,
> >>>> calloc() is used as a way to by-pass tcache in memory allocation and
> >>>> trigger safe-linking check in fastbins path. With tcache enabled, it
> >>>> needs extra workarounds to bypass tcache.
> >>>>
> >>>> Result of bench-malloc-thread benchmark
> >>>>
> >>>> Test Platform: Xeon-8380
> >>>> Bench Function: calloc
> >>>> Ratio: New / Original time_per_iteration (Lower is Better)
> >>>>
> >>>> Threads#   | Ratio
> >>>> -----------|------
> >>>> 1 thread   | 0.724
> >>>> 4 threads  | 0.534
> >>>>
> >>> Since you are working on calloc, please try this patch to see if
> >>> it improves performance on x86-64.
> >>>
> >>> Thanks.
> >> Look like the change is within variation. For bench-malloc-thread
> >> benchmark, the cycles spent in this area is not very hot.
> >>
> >> Test Platform: Xeon-8380
> >> Bench Function: calloc
> >> Ratio: New / Original time_per_iteration (Lower is Better)
> >>
> >> Threads#   | Ratio
> >> -----------|------
> >> 1 thread   | 0.993
> >> 4 threads  | 0.996
> >>
> > This patch reduces the number of branches from 3 to 1.  How
> > does it perform?
>
> The patch can have significant performance gain in bench-malloc-thread:
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads#   | Ratio
> -----------|------
> 1 thread   | 0.953
> 4 threads  | 0.952
>

Great.  I will submit it later today.


>
> H.J.
  
H.J. Lu Dec. 3, 2024, 9:39 p.m. UTC | #6
On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> This commit add tcache support in calloc() which can largely improve
> the performance of small size allocation, especially in multi-thread
> scenario. clear_mem() and tcache_available() is split out as a helper
> function for better reusing the code.
>
> Also fix tst-safe-linking failure after enabling tcache. In previous,
> calloc() is used as a way to by-pass tcache in memory allocation and
> trigger safe-linking check in fastbins path. With tcache enabled, it
> needs extra workarounds to bypass tcache.
>
> Result of bench-malloc-thread benchmark
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads#   | Ratio
> -----------|------
> 1 thread   | 0.724
> 4 threads  | 0.534

Please rebase.

> ---
> Changes in v3:
> - Split out tcache_available() as helper function.
> - Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
> Changes in v2:
> - Merge tst-safe-linking fix to make sure CI check pass.
> - Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
> ---
>  malloc/malloc.c           | 129 ++++++++++++++++++++++++--------------
>  malloc/tst-safe-linking.c |  81 ++++++++++++++++++++----
>  2 files changed, 150 insertions(+), 60 deletions(-)
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 81ddd2c3a8..1437ec20fb 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3208,6 +3208,18 @@ tcache_next (tcache_entry *e)
>    return (tcache_entry *) REVEAL_PTR (e->next);
>  }
>
> +/* Check if tcache is available for alloc by corresponding tc_idx.  */
> +static __always_inline bool
> +tcache_availabe (size_t tc_idx)
> +{
> +  if (tc_idx < mp_.tcache_bins
> +      && tcache != NULL
> +      && tcache->counts[tc_idx] > 0)
> +    return true;
> +  else
> +    return false;
> +}
> +
>  /* Verify if the suspicious tcache_entry is double free.
>     It's not expected to execute very often, mark it as noinline.  */
>  static __attribute__ ((noinline)) void
> @@ -3366,9 +3378,7 @@ __libc_malloc (size_t bytes)
>    MAYBE_INIT_TCACHE ();
>
>    DIAG_PUSH_NEEDS_COMMENT;
> -  if (tc_idx < mp_.tcache_bins
> -      && tcache != NULL
> -      && tcache->counts[tc_idx] > 0)
> +  if (tcache_availabe (tc_idx))
>      {
>        victim = tcache_get (tc_idx);
>        return tag_new_usable (victim);
> @@ -3667,9 +3677,7 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        }
>      size_t tc_idx = csize2tidx (tbytes);
>
> -    if (tc_idx < mp_.tcache_bins
> -       && tcache != NULL
> -       && tcache->counts[tc_idx] > 0)
> +    if (tcache_availabe (tc_idx))
>        {
>         /* The tcache itself isn't encoded, but the chain is.  */
>         tcache_entry **tep = & tcache->entries[tc_idx];
> @@ -3747,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
>    return _mid_memalign (pagesize, rounded_bytes, address);
>  }
>
> +static __always_inline void *
> +clear_mem (void *mem, INTERNAL_SIZE_T csz)
> +{
> +  INTERNAL_SIZE_T *d;
> +  unsigned long clearsize, nclears;
> +
> +  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> +     contents have an odd number of INTERNAL_SIZE_T-sized words;
> +     minimally 3.  */
> +  d = (INTERNAL_SIZE_T *) mem;
> +  clearsize = csz - SIZE_SZ;
> +  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +  assert (nclears >= 3);
> +
> +  if (nclears > 9)
> +    return memset (d, 0, clearsize);
> +
> +  else
> +    {
> +      *(d + 0) = 0;
> +      *(d + 1) = 0;
> +      *(d + 2) = 0;
> +      if (nclears > 4)
> +        {
> +          *(d + 3) = 0;
> +          *(d + 4) = 0;
> +          if (nclears > 6)
> +            {
> +              *(d + 5) = 0;
> +              *(d + 6) = 0;
> +              if (nclears > 8)
> +                {
> +                  *(d + 7) = 0;
> +                  *(d + 8) = 0;
> +                }
> +            }
> +        }
> +    }
> +
> +  return mem;
> +}
> +
>  void *
>  __libc_calloc (size_t n, size_t elem_size)
>  {
>    mstate av;
> -  mchunkptr oldtop;
> -  INTERNAL_SIZE_T sz, oldtopsize;
> +  mchunkptr oldtop, p;
> +  INTERNAL_SIZE_T sz, oldtopsize, csz;
>    void *mem;
> -  unsigned long clearsize;
> -  unsigned long nclears;
> -  INTERNAL_SIZE_T *d;
>    ptrdiff_t bytes;
>
>    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3772,6 +3819,27 @@ __libc_calloc (size_t n, size_t elem_size)
>
>    MAYBE_INIT_TCACHE ();
>
> +#if USE_TCACHE
> +  /* int_free also calls request2size, be careful to not pad twice.  */
> +  size_t tbytes = checked_request2size (bytes);
> +  if (tbytes == 0)
> +    {
> +      __set_errno (ENOMEM);
> +      return NULL;
> +    }
> +  size_t tc_idx = csize2tidx (tbytes);
> +
> +  if (tcache_availabe (tc_idx))
> +    {
> +      mem = tcache_get (tc_idx);
> +      p = mem2chunk (mem);
> +      if (__glibc_unlikely (mtag_enabled))
> +       return tag_new_zero_region (mem, memsize (p));
> +      csz = chunksize (p);
> +      return clear_mem (mem, csz);
> +    }
> +#endif
> +
>    if (SINGLE_THREAD_P)
>      av = &main_arena;
>    else
> @@ -3826,7 +3894,7 @@ __libc_calloc (size_t n, size_t elem_size)
>    if (mem == NULL)
>      return NULL;
>
> -  mchunkptr p = mem2chunk (mem);
> +  p = mem2chunk (mem);
>
>    /* If we are using memory tagging, then we need to set the tags
>       regardless of MORECORE_CLEARS, so we zero the whole block while
> @@ -3834,7 +3902,7 @@ __libc_calloc (size_t n, size_t elem_size)
>    if (__glibc_unlikely (mtag_enabled))
>      return tag_new_zero_region (mem, memsize (p));
>
> -  INTERNAL_SIZE_T csz = chunksize (p);
> +  csz = chunksize (p);
>
>    /* Two optional cases in which clearing not necessary */
>    if (chunk_is_mmapped (p))
> @@ -3853,40 +3921,7 @@ __libc_calloc (size_t n, size_t elem_size)
>      }
>  #endif
>
> -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> -     minimally 3.  */
> -  d = (INTERNAL_SIZE_T *) mem;
> -  clearsize = csz - SIZE_SZ;
> -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> -  assert (nclears >= 3);
> -
> -  if (nclears > 9)
> -    return memset (d, 0, clearsize);
> -
> -  else
> -    {
> -      *(d + 0) = 0;
> -      *(d + 1) = 0;
> -      *(d + 2) = 0;
> -      if (nclears > 4)
> -        {
> -          *(d + 3) = 0;
> -          *(d + 4) = 0;
> -          if (nclears > 6)
> -            {
> -              *(d + 5) = 0;
> -              *(d + 6) = 0;
> -              if (nclears > 8)
> -                {
> -                  *(d + 7) = 0;
> -                  *(d + 8) = 0;
> -                }
> -            }
> -        }
> -    }
> -
> -  return mem;
> +  return clear_mem (mem, csz);
>  }
>  #endif /* IS_IN (libc) */
>
> diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
> index 01dd07004d..5302575ad1 100644
> --- a/malloc/tst-safe-linking.c
> +++ b/malloc/tst-safe-linking.c
> @@ -111,22 +111,37 @@ test_fastbin (void *closure)
>    int i;
>    int mask = ((int *)closure)[0];
>    size_t size = TCACHE_ALLOC_SIZE;
> +  void * ps[TCACHE_FILL_COUNT];
> +  void * pps[TCACHE_FILL_COUNT];
>
>    printf ("++ fastbin ++\n");
>
> +  /* Populate the fastbin list.  */
> +  void * volatile a = calloc (1, size);
> +  void * volatile b = calloc (1, size);
> +  void * volatile c = calloc (1, size);
> +  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> +  /* Chunks for later tcache filling from fastbins.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      pps[i] = p;
> +    }
> +
>    /* Take the tcache out of the game.  */
>    for (i = 0; i < TCACHE_FILL_COUNT; ++i)
>      {
>        void * volatile p = calloc (1, size);
> -      printf ("p=%p\n", p);
> -      free (p);
> +      ps[i] = p;
>      }
>
> -  /* Populate the fastbin list.  */
> -  void * volatile a = calloc (1, size);
> -  void * volatile b = calloc (1, size);
> -  void * volatile c = calloc (1, size);
> -  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +    }
> +
> +  /* Free abc will return to fastbin in FIFO order.  */
>    free (a);
>    free (b);
>    free (c);
> @@ -136,11 +151,43 @@ test_fastbin (void *closure)
>    memset (c, mask & 0xFF, size);
>    printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
>
> +  /* Filling fastbins, will be copied to tcache later.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (pps[i]);
> +    }
> +
> +  /* Drain out tcache to make sure later alloc from fastbins.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      ps[i] = p;
> +    }
> +
> +  /* This line will also filling tcache with remain pps and c.  */
> +  pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
> +
> +  /* Tcache is FILO, now the first one is c, take it out.  */
>    c = calloc (1, size);
>    printf ("Allocated: c=%p\n", c);
> +
> +  /* Drain out remain pps from tcache.  */
> +  for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
> +    {
> +      void * volatile p = calloc (1, size);
> +      pps[i] = p;
> +    }
> +
>    /* This line will trigger the Safe-Linking check.  */
>    b = calloc (1, size);
>    printf ("b=%p\n", b);
> +
> +  /* Free previous pointers. */
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +      free (pps[i]);
> +    }
>  }
>
>  /* Try corrupting the fastbin list and trigger a consolidate.  */
> @@ -150,21 +197,29 @@ test_fastbin_consolidate (void *closure)
>    int i;
>    int mask = ((int*)closure)[0];
>    size_t size = TCACHE_ALLOC_SIZE;
> +  void * ps[TCACHE_FILL_COUNT];
>
>    printf ("++ fastbin consolidate ++\n");
>
> +  /* Populate the fastbin list.  */
> +  void * volatile a = calloc (1, size);
> +  void * volatile b = calloc (1, size);
> +  void * volatile c = calloc (1, size);
> +  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
>    /* Take the tcache out of the game.  */
>    for (i = 0; i < TCACHE_FILL_COUNT; ++i)
>      {
>        void * volatile p = calloc (1, size);
> -      free (p);
> +      ps[i] = p;
>      }
>
> -  /* Populate the fastbin list.  */
> -  void * volatile a = calloc (1, size);
> -  void * volatile b = calloc (1, size);
> -  void * volatile c = calloc (1, size);
> -  printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> +    {
> +      free (ps[i]);
> +    }
> +
> +  /* Free abc will return to fastbin.  */
>    free (a);
>    free (b);
>    free (c);
> --
> 2.43.5
>
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 81ddd2c3a8..1437ec20fb 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3208,6 +3208,18 @@  tcache_next (tcache_entry *e)
   return (tcache_entry *) REVEAL_PTR (e->next);
 }
 
+/* Check if tcache is available for alloc by corresponding tc_idx.  */
+static __always_inline bool
+tcache_availabe (size_t tc_idx)
+{
+  if (tc_idx < mp_.tcache_bins
+      && tcache != NULL
+      && tcache->counts[tc_idx] > 0)
+    return true;
+  else
+    return false;
+}
+
 /* Verify if the suspicious tcache_entry is double free.
    It's not expected to execute very often, mark it as noinline.  */
 static __attribute__ ((noinline)) void
@@ -3366,9 +3378,7 @@  __libc_malloc (size_t bytes)
   MAYBE_INIT_TCACHE ();
 
   DIAG_PUSH_NEEDS_COMMENT;
-  if (tc_idx < mp_.tcache_bins
-      && tcache != NULL
-      && tcache->counts[tc_idx] > 0)
+  if (tcache_availabe (tc_idx))
     {
       victim = tcache_get (tc_idx);
       return tag_new_usable (victim);
@@ -3667,9 +3677,7 @@  _mid_memalign (size_t alignment, size_t bytes, void *address)
       }
     size_t tc_idx = csize2tidx (tbytes);
 
-    if (tc_idx < mp_.tcache_bins
-	&& tcache != NULL
-	&& tcache->counts[tc_idx] > 0)
+    if (tcache_availabe (tc_idx))
       {
 	/* The tcache itself isn't encoded, but the chain is.  */
 	tcache_entry **tep = & tcache->entries[tc_idx];
@@ -3747,16 +3755,55 @@  __libc_pvalloc (size_t bytes)
   return _mid_memalign (pagesize, rounded_bytes, address);
 }
 
+static __always_inline void *
+clear_mem (void *mem, INTERNAL_SIZE_T csz)
+{
+  INTERNAL_SIZE_T *d;
+  unsigned long clearsize, nclears;
+
+  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
+     contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3.  */
+  d = (INTERNAL_SIZE_T *) mem;
+  clearsize = csz - SIZE_SZ;
+  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+  assert (nclears >= 3);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  else
+    {
+      *(d + 0) = 0;
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      if (nclears > 4)
+        {
+          *(d + 3) = 0;
+          *(d + 4) = 0;
+          if (nclears > 6)
+            {
+              *(d + 5) = 0;
+              *(d + 6) = 0;
+              if (nclears > 8)
+                {
+                  *(d + 7) = 0;
+                  *(d + 8) = 0;
+                }
+            }
+        }
+    }
+
+  return mem;
+}
+
 void *
 __libc_calloc (size_t n, size_t elem_size)
 {
   mstate av;
-  mchunkptr oldtop;
-  INTERNAL_SIZE_T sz, oldtopsize;
+  mchunkptr oldtop, p;
+  INTERNAL_SIZE_T sz, oldtopsize, csz;
   void *mem;
-  unsigned long clearsize;
-  unsigned long nclears;
-  INTERNAL_SIZE_T *d;
   ptrdiff_t bytes;
 
   if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3772,6 +3819,27 @@  __libc_calloc (size_t n, size_t elem_size)
 
   MAYBE_INIT_TCACHE ();
 
+#if USE_TCACHE
+  /* int_free also calls request2size; be careful not to pad twice.  */
+  size_t tbytes = checked_request2size (bytes);
+  if (tbytes == 0)
+    {
+      __set_errno (ENOMEM);
+      return NULL;
+    }
+  size_t tc_idx = csize2tidx (tbytes);
+
+  if (tcache_availabe (tc_idx))
+    {
+      mem = tcache_get (tc_idx);
+      p = mem2chunk (mem);
+      if (__glibc_unlikely (mtag_enabled))
+	return tag_new_zero_region (mem, memsize (p));
+      csz = chunksize (p);
+      return clear_mem (mem, csz);
+    }
+#endif
+
   if (SINGLE_THREAD_P)
     av = &main_arena;
   else
@@ -3826,7 +3894,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (mem == NULL)
     return NULL;
 
-  mchunkptr p = mem2chunk (mem);
+  p = mem2chunk (mem);
 
   /* If we are using memory tagging, then we need to set the tags
      regardless of MORECORE_CLEARS, so we zero the whole block while
@@ -3834,7 +3902,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (__glibc_unlikely (mtag_enabled))
     return tag_new_zero_region (mem, memsize (p));
 
-  INTERNAL_SIZE_T csz = chunksize (p);
+  csz = chunksize (p);
 
   /* Two optional cases in which clearing not necessary */
   if (chunk_is_mmapped (p))
@@ -3853,40 +3921,7 @@  __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
-  d = (INTERNAL_SIZE_T *) mem;
-  clearsize = csz - SIZE_SZ;
-  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
-  assert (nclears >= 3);
-
-  if (nclears > 9)
-    return memset (d, 0, clearsize);
-
-  else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
-
-  return mem;
+  return clear_mem (mem, csz);
 }
 #endif /* IS_IN (libc) */
 
diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
index 01dd07004d..5302575ad1 100644
--- a/malloc/tst-safe-linking.c
+++ b/malloc/tst-safe-linking.c
@@ -111,22 +111,37 @@  test_fastbin (void *closure)
   int i;
   int mask = ((int *)closure)[0];
   size_t size = TCACHE_ALLOC_SIZE;
+  void * ps[TCACHE_FILL_COUNT];
+  void * pps[TCACHE_FILL_COUNT];
 
   printf ("++ fastbin ++\n");
 
+  /* Populate the fastbin list.  */
+  void * volatile a = calloc (1, size);
+  void * volatile b = calloc (1, size);
+  void * volatile c = calloc (1, size);
+  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
+  /* Chunks that will later be moved from the fastbins into the tcache.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      pps[i] = p;
+    }
+
   /* Take the tcache out of the game.  */
   for (i = 0; i < TCACHE_FILL_COUNT; ++i)
     {
       void * volatile p = calloc (1, size);
-      printf ("p=%p\n", p);
-      free (p);
+      ps[i] = p;
     }
 
-  /* Populate the fastbin list.  */
-  void * volatile a = calloc (1, size);
-  void * volatile b = calloc (1, size);
-  void * volatile c = calloc (1, size);
-  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+    }
+
+  /* Freeing a, b and c returns them to the fastbin (c ends up at the head).  */
   free (a);
   free (b);
   free (c);
@@ -136,11 +151,43 @@  test_fastbin (void *closure)
   memset (c, mask & 0xFF, size);
   printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
 
+  /* Filling fastbins, will be copied to tcache later.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (pps[i]);
+    }
+
+  /* Drain the tcache to make sure later allocations come from the fastbins.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      ps[i] = p;
+    }
+
+  /* This allocation also fills the tcache with the remaining pps and c.  */
+  pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
+
+  /* The tcache is LIFO; the first entry is now c, take it out.  */
   c = calloc (1, size);
   printf ("Allocated: c=%p\n", c);
+
+  /* Drain the remaining pps from the tcache.  */
+  for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
+    {
+      void * volatile p = calloc (1, size);
+      pps[i] = p;
+    }
+
   /* This line will trigger the Safe-Linking check.  */
   b = calloc (1, size);
   printf ("b=%p\n", b);
+
+  /* Free the previously allocated pointers.  */
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+      free (pps[i]);
+    }
 }
 
 /* Try corrupting the fastbin list and trigger a consolidate.  */
@@ -150,21 +197,29 @@  test_fastbin_consolidate (void *closure)
   int i;
   int mask = ((int*)closure)[0];
   size_t size = TCACHE_ALLOC_SIZE;
+  void * ps[TCACHE_FILL_COUNT];
 
   printf ("++ fastbin consolidate ++\n");
 
+  /* Populate the fastbin list.  */
+  void * volatile a = calloc (1, size);
+  void * volatile b = calloc (1, size);
+  void * volatile c = calloc (1, size);
+  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
   /* Take the tcache out of the game.  */
   for (i = 0; i < TCACHE_FILL_COUNT; ++i)
     {
       void * volatile p = calloc (1, size);
-      free (p);
+      ps[i] = p;
     }
 
-  /* Populate the fastbin list.  */
-  void * volatile a = calloc (1, size);
-  void * volatile b = calloc (1, size);
-  void * volatile c = calloc (1, size);
-  printf ("a=%p, b=%p, c=%p\n", a, b, c);
+  for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+    {
+      free (ps[i]);
+    }
+
+  /* Freeing a, b and c returns them to the fastbin.  */
   free (a);
   free (b);
   free (c);