[v4,3/3] malloc: Add tcache path for calloc
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
This commit adds tcache support in calloc(), which can largely improve
the performance of small-size allocations, especially in multi-threaded
scenarios. clear_mem() and tcache_available() are split out as helper
functions for better code reuse.
Also fix the tst-safe-linking failure after enabling tcache. Previously,
calloc() was used as a way to bypass tcache in memory allocation and
trigger the safe-linking check in the fastbins path. With tcache enabled,
extra workarounds are needed to bypass tcache.
Result of bench-malloc-thread benchmark
Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)
Threads# | Ratio
-----------|------
1 thread | 0.724
4 threads | 0.534
---
Changes in v3:
- Split out tcache_available() as helper function.
- Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
Changes in v2:
- Merge tst-safe-linking fix to make sure CI check pass.
- Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
---
malloc/malloc.c | 129 ++++++++++++++++++++++++--------------
malloc/tst-safe-linking.c | 81 ++++++++++++++++++++----
2 files changed, 150 insertions(+), 60 deletions(-)
Comments
On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> This commit add tcache support in calloc() which can largely improve
> the performance of small size allocation, especially in multi-thread
> scenario. clear_mem() and tcache_available() is split out as a helper
> function for better reusing the code.
>
> Also fix tst-safe-linking failure after enabling tcache. In previous,
> calloc() is used as a way to by-pass tcache in memory allocation and
> trigger safe-linking check in fastbins path. With tcache enabled, it
> needs extra workarounds to bypass tcache.
>
> Result of bench-malloc-thread benchmark
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads# | Ratio
> -----------|------
> 1 thread | 0.724
> 4 threads | 0.534
>
> ---
> Changes in v3:
> - Split out tcache_available() as helper function.
> - Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
> Changes in v2:
> - Merge tst-safe-linking fix to make sure CI check pass.
> - Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
> ---
> malloc/malloc.c | 129 ++++++++++++++++++++++++--------------
> malloc/tst-safe-linking.c | 81 ++++++++++++++++++++----
> 2 files changed, 150 insertions(+), 60 deletions(-)
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 81ddd2c3a8..1437ec20fb 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3208,6 +3208,18 @@ tcache_next (tcache_entry *e)
> return (tcache_entry *) REVEAL_PTR (e->next);
> }
>
> +/* Check if tcache is available for alloc by corresponding tc_idx. */
> +static __always_inline bool
> +tcache_availabe (size_t tc_idx)
> +{
> + if (tc_idx < mp_.tcache_bins
> + && tcache != NULL
> + && tcache->counts[tc_idx] > 0)
> + return true;
> + else
> + return false;
> +}
> +
> /* Verify if the suspicious tcache_entry is double free.
> It's not expected to execute very often, mark it as noinline. */
> static __attribute__ ((noinline)) void
> @@ -3366,9 +3378,7 @@ __libc_malloc (size_t bytes)
> MAYBE_INIT_TCACHE ();
>
> DIAG_PUSH_NEEDS_COMMENT;
> - if (tc_idx < mp_.tcache_bins
> - && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + if (tcache_availabe (tc_idx))
> {
> victim = tcache_get (tc_idx);
> return tag_new_usable (victim);
> @@ -3667,9 +3677,7 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
> }
> size_t tc_idx = csize2tidx (tbytes);
>
> - if (tc_idx < mp_.tcache_bins
> - && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + if (tcache_availabe (tc_idx))
> {
> /* The tcache itself isn't encoded, but the chain is. */
> tcache_entry **tep = & tcache->entries[tc_idx];
> @@ -3747,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
> return _mid_memalign (pagesize, rounded_bytes, address);
> }
>
> +static __always_inline void *
> +clear_mem (void *mem, INTERNAL_SIZE_T csz)
> +{
> + INTERNAL_SIZE_T *d;
> + unsigned long clearsize, nclears;
> +
> + /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> + contents have an odd number of INTERNAL_SIZE_T-sized words;
> + minimally 3. */
> + d = (INTERNAL_SIZE_T *) mem;
> + clearsize = csz - SIZE_SZ;
> + nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> + assert (nclears >= 3);
> +
> + if (nclears > 9)
> + return memset (d, 0, clearsize);
> +
> + else
> + {
> + *(d + 0) = 0;
> + *(d + 1) = 0;
> + *(d + 2) = 0;
> + if (nclears > 4)
> + {
> + *(d + 3) = 0;
> + *(d + 4) = 0;
> + if (nclears > 6)
> + {
> + *(d + 5) = 0;
> + *(d + 6) = 0;
> + if (nclears > 8)
> + {
> + *(d + 7) = 0;
> + *(d + 8) = 0;
> + }
> + }
> + }
> + }
> +
> + return mem;
> +}
> +
> void *
> __libc_calloc (size_t n, size_t elem_size)
> {
> mstate av;
> - mchunkptr oldtop;
> - INTERNAL_SIZE_T sz, oldtopsize;
> + mchunkptr oldtop, p;
> + INTERNAL_SIZE_T sz, oldtopsize, csz;
> void *mem;
> - unsigned long clearsize;
> - unsigned long nclears;
> - INTERNAL_SIZE_T *d;
> ptrdiff_t bytes;
>
> if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3772,6 +3819,27 @@ __libc_calloc (size_t n, size_t elem_size)
>
> MAYBE_INIT_TCACHE ();
>
> +#if USE_TCACHE
> + /* int_free also calls request2size, be careful to not pad twice. */
> + size_t tbytes = checked_request2size (bytes);
> + if (tbytes == 0)
> + {
> + __set_errno (ENOMEM);
> + return NULL;
> + }
> + size_t tc_idx = csize2tidx (tbytes);
> +
> + if (tcache_availabe (tc_idx))
> + {
> + mem = tcache_get (tc_idx);
> + p = mem2chunk (mem);
> + if (__glibc_unlikely (mtag_enabled))
> + return tag_new_zero_region (mem, memsize (p));
> + csz = chunksize (p);
> + return clear_mem (mem, csz);
> + }
> +#endif
> +
> if (SINGLE_THREAD_P)
> av = &main_arena;
> else
> @@ -3826,7 +3894,7 @@ __libc_calloc (size_t n, size_t elem_size)
> if (mem == NULL)
> return NULL;
>
> - mchunkptr p = mem2chunk (mem);
> + p = mem2chunk (mem);
>
> /* If we are using memory tagging, then we need to set the tags
> regardless of MORECORE_CLEARS, so we zero the whole block while
> @@ -3834,7 +3902,7 @@ __libc_calloc (size_t n, size_t elem_size)
> if (__glibc_unlikely (mtag_enabled))
> return tag_new_zero_region (mem, memsize (p));
>
> - INTERNAL_SIZE_T csz = chunksize (p);
> + csz = chunksize (p);
>
> /* Two optional cases in which clearing not necessary */
> if (chunk_is_mmapped (p))
> @@ -3853,40 +3921,7 @@ __libc_calloc (size_t n, size_t elem_size)
> }
> #endif
>
> - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> - contents have an odd number of INTERNAL_SIZE_T-sized words;
> - minimally 3. */
> - d = (INTERNAL_SIZE_T *) mem;
> - clearsize = csz - SIZE_SZ;
> - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> - assert (nclears >= 3);
> -
> - if (nclears > 9)
> - return memset (d, 0, clearsize);
> -
> - else
> - {
> - *(d + 0) = 0;
> - *(d + 1) = 0;
> - *(d + 2) = 0;
> - if (nclears > 4)
> - {
> - *(d + 3) = 0;
> - *(d + 4) = 0;
> - if (nclears > 6)
> - {
> - *(d + 5) = 0;
> - *(d + 6) = 0;
> - if (nclears > 8)
> - {
> - *(d + 7) = 0;
> - *(d + 8) = 0;
> - }
> - }
> - }
> - }
> -
> - return mem;
> + return clear_mem (mem, csz);
> }
> #endif /* IS_IN (libc) */
>
> diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
> index 01dd07004d..5302575ad1 100644
> --- a/malloc/tst-safe-linking.c
> +++ b/malloc/tst-safe-linking.c
> @@ -111,22 +111,37 @@ test_fastbin (void *closure)
> int i;
> int mask = ((int *)closure)[0];
> size_t size = TCACHE_ALLOC_SIZE;
> + void * ps[TCACHE_FILL_COUNT];
> + void * pps[TCACHE_FILL_COUNT];
>
> printf ("++ fastbin ++\n");
>
> + /* Populate the fastbin list. */
> + void * volatile a = calloc (1, size);
> + void * volatile b = calloc (1, size);
> + void * volatile c = calloc (1, size);
> + printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> + /* Chunks for later tcache filling from fastbins. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + pps[i] = p;
> + }
> +
> /* Take the tcache out of the game. */
> for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> {
> void * volatile p = calloc (1, size);
> - printf ("p=%p\n", p);
> - free (p);
> + ps[i] = p;
> }
>
> - /* Populate the fastbin list. */
> - void * volatile a = calloc (1, size);
> - void * volatile b = calloc (1, size);
> - void * volatile c = calloc (1, size);
> - printf ("a=%p, b=%p, c=%p\n", a, b, c);
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + }
> +
> + /* Free abc will return to fastbin in FIFO order. */
> free (a);
> free (b);
> free (c);
> @@ -136,11 +151,43 @@ test_fastbin (void *closure)
> memset (c, mask & 0xFF, size);
> printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
>
> + /* Filling fastbins, will be copied to tcache later. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (pps[i]);
> + }
> +
> + /* Drain out tcache to make sure later alloc from fastbins. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + ps[i] = p;
> + }
> +
> + /* This line will also filling tcache with remain pps and c. */
> + pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
> +
> + /* Tcache is FILO, now the first one is c, take it out. */
> c = calloc (1, size);
> printf ("Allocated: c=%p\n", c);
> +
> + /* Drain out remain pps from tcache. */
> + for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + pps[i] = p;
> + }
> +
> /* This line will trigger the Safe-Linking check. */
> b = calloc (1, size);
> printf ("b=%p\n", b);
> +
> + /* Free previous pointers. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + free (pps[i]);
> + }
> }
>
> /* Try corrupting the fastbin list and trigger a consolidate. */
> @@ -150,21 +197,29 @@ test_fastbin_consolidate (void *closure)
> int i;
> int mask = ((int*)closure)[0];
> size_t size = TCACHE_ALLOC_SIZE;
> + void * ps[TCACHE_FILL_COUNT];
>
> printf ("++ fastbin consolidate ++\n");
>
> + /* Populate the fastbin list. */
> + void * volatile a = calloc (1, size);
> + void * volatile b = calloc (1, size);
> + void * volatile c = calloc (1, size);
> + printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> /* Take the tcache out of the game. */
> for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> {
> void * volatile p = calloc (1, size);
> - free (p);
> + ps[i] = p;
> }
>
> - /* Populate the fastbin list. */
> - void * volatile a = calloc (1, size);
> - void * volatile b = calloc (1, size);
> - void * volatile c = calloc (1, size);
> - printf ("a=%p, b=%p, c=%p\n", a, b, c);
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + }
> +
> + /* Free abc will return to fastbin. */
> free (a);
> free (b);
> free (c);
> --
> 2.43.5
>
Since you are working on calloc, please try this patch to see if
it improves performance on x86-64.
Thanks.
On 11/26/2024 5:08 PM, H.J. Lu wrote:
> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>> This commit add tcache support in calloc() which can largely improve
>> the performance of small size allocation, especially in multi-thread
>> scenario. clear_mem() and tcache_available() is split out as a helper
>> function for better reusing the code.
>>
>> Also fix tst-safe-linking failure after enabling tcache. In previous,
>> calloc() is used as a way to by-pass tcache in memory allocation and
>> trigger safe-linking check in fastbins path. With tcache enabled, it
>> needs extra workarounds to bypass tcache.
>>
>> Result of bench-malloc-thread benchmark
>>
>> Test Platform: Xeon-8380
>> Bench Function: calloc
>> Ratio: New / Original time_per_iteration (Lower is Better)
>>
>> Threads# | Ratio
>> -----------|------
>> 1 thread | 0.724
>> 4 threads | 0.534
>>
> Since you are working on calloc, please try this patch to see if
> it improves performance on x86-64.
>
> Thanks.
Looks like the change is within variation. For the bench-malloc-thread
benchmark, the cycles spent in this area are not very hot.
Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)
Threads# | Ratio
-----------|------
1 thread | 0.993
4 threads | 0.996
On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>
> On 11/26/2024 5:08 PM, H.J. Lu wrote:
>
> > On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
> >> This commit add tcache support in calloc() which can largely improve
> >> the performance of small size allocation, especially in multi-thread
> >> scenario. clear_mem() and tcache_available() is split out as a helper
> >> function for better reusing the code.
> >>
> >> Also fix tst-safe-linking failure after enabling tcache. In previous,
> >> calloc() is used as a way to by-pass tcache in memory allocation and
> >> trigger safe-linking check in fastbins path. With tcache enabled, it
> >> needs extra workarounds to bypass tcache.
> >>
> >> Result of bench-malloc-thread benchmark
> >>
> >> Test Platform: Xeon-8380
> >> Bench Function: calloc
> >> Ratio: New / Original time_per_iteration (Lower is Better)
> >>
> >> Threads# | Ratio
> >> -----------|------
> >> 1 thread | 0.724
> >> 4 threads | 0.534
> >>
> > Since you are working on calloc, please try this patch to see if
> > it improves performance on x86-64.
> >
> > Thanks.
>
> Look like the change is within variation. For bench-malloc-thread
> benchmark, the cycles spent in this area is not very hot.
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads# | Ratio
> -----------|------
> 1 thread | 0.993
> 4 threads | 0.996
>
This patch reduces the number of branches from 3 to 1. How
does it perform?
On 11/27/2024 5:58 AM, H.J. Lu wrote:
> On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>> On 11/26/2024 5:08 PM, H.J. Lu wrote:
>>
>>> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>>>> This commit add tcache support in calloc() which can largely improve
>>>> the performance of small size allocation, especially in multi-thread
>>>> scenario. clear_mem() and tcache_available() is split out as a helper
>>>> function for better reusing the code.
>>>>
>>>> Also fix tst-safe-linking failure after enabling tcache. In previous,
>>>> calloc() is used as a way to by-pass tcache in memory allocation and
>>>> trigger safe-linking check in fastbins path. With tcache enabled, it
>>>> needs extra workarounds to bypass tcache.
>>>>
>>>> Result of bench-malloc-thread benchmark
>>>>
>>>> Test Platform: Xeon-8380
>>>> Bench Function: calloc
>>>> Ratio: New / Original time_per_iteration (Lower is Better)
>>>>
>>>> Threads# | Ratio
>>>> -----------|------
>>>> 1 thread | 0.724
>>>> 4 threads | 0.534
>>>>
>>> Since you are working on calloc, please try this patch to see if
>>> it improves performance on x86-64.
>>>
>>> Thanks.
>> Look like the change is within variation. For bench-malloc-thread
>> benchmark, the cycles spent in this area is not very hot.
>>
>> Test Platform: Xeon-8380
>> Bench Function: calloc
>> Ratio: New / Original time_per_iteration (Lower is Better)
>>
>> Threads# | Ratio
>> -----------|------
>> 1 thread | 0.993
>> 4 threads | 0.996
>>
> This patch reduces the number of branches from 3 to 1. How
> does it perform?
The patch can give a significant performance gain in bench-malloc-thread:
Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)
Threads# | Ratio
-----------|------
1 thread | 0.953
4 threads | 0.952
On Wed, Nov 27, 2024, 8:51 AM Guo, Wangyang <wangyang.guo@intel.com> wrote:
> On 11/27/2024 5:58 AM, H.J. Lu wrote:
>
> > On Tue, Nov 26, 2024 at 5:39 PM Guo, Wangyang <wangyang.guo@intel.com>
> wrote:
> >> On 11/26/2024 5:08 PM, H.J. Lu wrote:
> >>
> >>> On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com>
> wrote:
> >>>> This commit add tcache support in calloc() which can largely improve
> >>>> the performance of small size allocation, especially in multi-thread
> >>>> scenario. clear_mem() and tcache_available() is split out as a helper
> >>>> function for better reusing the code.
> >>>>
> >>>> Also fix tst-safe-linking failure after enabling tcache. In previous,
> >>>> calloc() is used as a way to by-pass tcache in memory allocation and
> >>>> trigger safe-linking check in fastbins path. With tcache enabled, it
> >>>> needs extra workarounds to bypass tcache.
> >>>>
> >>>> Result of bench-malloc-thread benchmark
> >>>>
> >>>> Test Platform: Xeon-8380
> >>>> Bench Function: calloc
> >>>> Ratio: New / Original time_per_iteration (Lower is Better)
> >>>>
> >>>> Threads# | Ratio
> >>>> -----------|------
> >>>> 1 thread | 0.724
> >>>> 4 threads | 0.534
> >>>>
> >>> Since you are working on calloc, please try this patch to see if
> >>> it improves performance on x86-64.
> >>>
> >>> Thanks.
> >> Look like the change is within variation. For bench-malloc-thread
> >> benchmark, the cycles spent in this area is not very hot.
> >>
> >> Test Platform: Xeon-8380
> >> Bench Function: calloc
> >> Ratio: New / Original time_per_iteration (Lower is Better)
> >>
> >> Threads# | Ratio
> >> -----------|------
> >> 1 thread | 0.993
> >> 4 threads | 0.996
> >>
> > This patch reduces the number of branches from 3 to 1. How
> > does it perform?
>
> The patch can have significant performance gain in bench-malloc-thread:
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads# | Ratio
> -----------|------
> 1 thread | 0.953
> 4 threads | 0.952
>
Great. I will submit it later today.
>
> H.J.
On Tue, Nov 26, 2024 at 3:37 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> This commit add tcache support in calloc() which can largely improve
> the performance of small size allocation, especially in multi-thread
> scenario. clear_mem() and tcache_available() is split out as a helper
> function for better reusing the code.
>
> Also fix tst-safe-linking failure after enabling tcache. In previous,
> calloc() is used as a way to by-pass tcache in memory allocation and
> trigger safe-linking check in fastbins path. With tcache enabled, it
> needs extra workarounds to bypass tcache.
>
> Result of bench-malloc-thread benchmark
>
> Test Platform: Xeon-8380
> Bench Function: calloc
> Ratio: New / Original time_per_iteration (Lower is Better)
>
> Threads# | Ratio
> -----------|------
> 1 thread | 0.724
> 4 threads | 0.534
Please rebase.
> ---
> Changes in v3:
> - Split out tcache_available() as helper function.
> - Link to v2: https://sourceware.org/pipermail/libc-alpha/2024-August/159430.html
> Changes in v2:
> - Merge tst-safe-linking fix to make sure CI check pass.
> - Link to v1: https://sourceware.org/pipermail/libc-alpha/2024-August/159362.html
> ---
> malloc/malloc.c | 129 ++++++++++++++++++++++++--------------
> malloc/tst-safe-linking.c | 81 ++++++++++++++++++++----
> 2 files changed, 150 insertions(+), 60 deletions(-)
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 81ddd2c3a8..1437ec20fb 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3208,6 +3208,18 @@ tcache_next (tcache_entry *e)
> return (tcache_entry *) REVEAL_PTR (e->next);
> }
>
> +/* Check if tcache is available for alloc by corresponding tc_idx. */
> +static __always_inline bool
> +tcache_availabe (size_t tc_idx)
> +{
> + if (tc_idx < mp_.tcache_bins
> + && tcache != NULL
> + && tcache->counts[tc_idx] > 0)
> + return true;
> + else
> + return false;
> +}
> +
> /* Verify if the suspicious tcache_entry is double free.
> It's not expected to execute very often, mark it as noinline. */
> static __attribute__ ((noinline)) void
> @@ -3366,9 +3378,7 @@ __libc_malloc (size_t bytes)
> MAYBE_INIT_TCACHE ();
>
> DIAG_PUSH_NEEDS_COMMENT;
> - if (tc_idx < mp_.tcache_bins
> - && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + if (tcache_availabe (tc_idx))
> {
> victim = tcache_get (tc_idx);
> return tag_new_usable (victim);
> @@ -3667,9 +3677,7 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
> }
> size_t tc_idx = csize2tidx (tbytes);
>
> - if (tc_idx < mp_.tcache_bins
> - && tcache != NULL
> - && tcache->counts[tc_idx] > 0)
> + if (tcache_availabe (tc_idx))
> {
> /* The tcache itself isn't encoded, but the chain is. */
> tcache_entry **tep = & tcache->entries[tc_idx];
> @@ -3747,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
> return _mid_memalign (pagesize, rounded_bytes, address);
> }
>
> +static __always_inline void *
> +clear_mem (void *mem, INTERNAL_SIZE_T csz)
> +{
> + INTERNAL_SIZE_T *d;
> + unsigned long clearsize, nclears;
> +
> + /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> + contents have an odd number of INTERNAL_SIZE_T-sized words;
> + minimally 3. */
> + d = (INTERNAL_SIZE_T *) mem;
> + clearsize = csz - SIZE_SZ;
> + nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> + assert (nclears >= 3);
> +
> + if (nclears > 9)
> + return memset (d, 0, clearsize);
> +
> + else
> + {
> + *(d + 0) = 0;
> + *(d + 1) = 0;
> + *(d + 2) = 0;
> + if (nclears > 4)
> + {
> + *(d + 3) = 0;
> + *(d + 4) = 0;
> + if (nclears > 6)
> + {
> + *(d + 5) = 0;
> + *(d + 6) = 0;
> + if (nclears > 8)
> + {
> + *(d + 7) = 0;
> + *(d + 8) = 0;
> + }
> + }
> + }
> + }
> +
> + return mem;
> +}
> +
> void *
> __libc_calloc (size_t n, size_t elem_size)
> {
> mstate av;
> - mchunkptr oldtop;
> - INTERNAL_SIZE_T sz, oldtopsize;
> + mchunkptr oldtop, p;
> + INTERNAL_SIZE_T sz, oldtopsize, csz;
> void *mem;
> - unsigned long clearsize;
> - unsigned long nclears;
> - INTERNAL_SIZE_T *d;
> ptrdiff_t bytes;
>
> if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3772,6 +3819,27 @@ __libc_calloc (size_t n, size_t elem_size)
>
> MAYBE_INIT_TCACHE ();
>
> +#if USE_TCACHE
> + /* int_free also calls request2size, be careful to not pad twice. */
> + size_t tbytes = checked_request2size (bytes);
> + if (tbytes == 0)
> + {
> + __set_errno (ENOMEM);
> + return NULL;
> + }
> + size_t tc_idx = csize2tidx (tbytes);
> +
> + if (tcache_availabe (tc_idx))
> + {
> + mem = tcache_get (tc_idx);
> + p = mem2chunk (mem);
> + if (__glibc_unlikely (mtag_enabled))
> + return tag_new_zero_region (mem, memsize (p));
> + csz = chunksize (p);
> + return clear_mem (mem, csz);
> + }
> +#endif
> +
> if (SINGLE_THREAD_P)
> av = &main_arena;
> else
> @@ -3826,7 +3894,7 @@ __libc_calloc (size_t n, size_t elem_size)
> if (mem == NULL)
> return NULL;
>
> - mchunkptr p = mem2chunk (mem);
> + p = mem2chunk (mem);
>
> /* If we are using memory tagging, then we need to set the tags
> regardless of MORECORE_CLEARS, so we zero the whole block while
> @@ -3834,7 +3902,7 @@ __libc_calloc (size_t n, size_t elem_size)
> if (__glibc_unlikely (mtag_enabled))
> return tag_new_zero_region (mem, memsize (p));
>
> - INTERNAL_SIZE_T csz = chunksize (p);
> + csz = chunksize (p);
>
> /* Two optional cases in which clearing not necessary */
> if (chunk_is_mmapped (p))
> @@ -3853,40 +3921,7 @@ __libc_calloc (size_t n, size_t elem_size)
> }
> #endif
>
> - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> - contents have an odd number of INTERNAL_SIZE_T-sized words;
> - minimally 3. */
> - d = (INTERNAL_SIZE_T *) mem;
> - clearsize = csz - SIZE_SZ;
> - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> - assert (nclears >= 3);
> -
> - if (nclears > 9)
> - return memset (d, 0, clearsize);
> -
> - else
> - {
> - *(d + 0) = 0;
> - *(d + 1) = 0;
> - *(d + 2) = 0;
> - if (nclears > 4)
> - {
> - *(d + 3) = 0;
> - *(d + 4) = 0;
> - if (nclears > 6)
> - {
> - *(d + 5) = 0;
> - *(d + 6) = 0;
> - if (nclears > 8)
> - {
> - *(d + 7) = 0;
> - *(d + 8) = 0;
> - }
> - }
> - }
> - }
> -
> - return mem;
> + return clear_mem (mem, csz);
> }
> #endif /* IS_IN (libc) */
>
> diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
> index 01dd07004d..5302575ad1 100644
> --- a/malloc/tst-safe-linking.c
> +++ b/malloc/tst-safe-linking.c
> @@ -111,22 +111,37 @@ test_fastbin (void *closure)
> int i;
> int mask = ((int *)closure)[0];
> size_t size = TCACHE_ALLOC_SIZE;
> + void * ps[TCACHE_FILL_COUNT];
> + void * pps[TCACHE_FILL_COUNT];
>
> printf ("++ fastbin ++\n");
>
> + /* Populate the fastbin list. */
> + void * volatile a = calloc (1, size);
> + void * volatile b = calloc (1, size);
> + void * volatile c = calloc (1, size);
> + printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> + /* Chunks for later tcache filling from fastbins. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + pps[i] = p;
> + }
> +
> /* Take the tcache out of the game. */
> for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> {
> void * volatile p = calloc (1, size);
> - printf ("p=%p\n", p);
> - free (p);
> + ps[i] = p;
> }
>
> - /* Populate the fastbin list. */
> - void * volatile a = calloc (1, size);
> - void * volatile b = calloc (1, size);
> - void * volatile c = calloc (1, size);
> - printf ("a=%p, b=%p, c=%p\n", a, b, c);
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + }
> +
> + /* Free abc will return to fastbin in FIFO order. */
> free (a);
> free (b);
> free (c);
> @@ -136,11 +151,43 @@ test_fastbin (void *closure)
> memset (c, mask & 0xFF, size);
> printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
>
> + /* Filling fastbins, will be copied to tcache later. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (pps[i]);
> + }
> +
> + /* Drain out tcache to make sure later alloc from fastbins. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + ps[i] = p;
> + }
> +
> + /* This line will also filling tcache with remain pps and c. */
> + pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
> +
> + /* Tcache is FILO, now the first one is c, take it out. */
> c = calloc (1, size);
> printf ("Allocated: c=%p\n", c);
> +
> + /* Drain out remain pps from tcache. */
> + for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
> + {
> + void * volatile p = calloc (1, size);
> + pps[i] = p;
> + }
> +
> /* This line will trigger the Safe-Linking check. */
> b = calloc (1, size);
> printf ("b=%p\n", b);
> +
> + /* Free previous pointers. */
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + free (pps[i]);
> + }
> }
>
> /* Try corrupting the fastbin list and trigger a consolidate. */
> @@ -150,21 +197,29 @@ test_fastbin_consolidate (void *closure)
> int i;
> int mask = ((int*)closure)[0];
> size_t size = TCACHE_ALLOC_SIZE;
> + void * ps[TCACHE_FILL_COUNT];
>
> printf ("++ fastbin consolidate ++\n");
>
> + /* Populate the fastbin list. */
> + void * volatile a = calloc (1, size);
> + void * volatile b = calloc (1, size);
> + void * volatile c = calloc (1, size);
> + printf ("a=%p, b=%p, c=%p\n", a, b, c);
> +
> /* Take the tcache out of the game. */
> for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> {
> void * volatile p = calloc (1, size);
> - free (p);
> + ps[i] = p;
> }
>
> - /* Populate the fastbin list. */
> - void * volatile a = calloc (1, size);
> - void * volatile b = calloc (1, size);
> - void * volatile c = calloc (1, size);
> - printf ("a=%p, b=%p, c=%p\n", a, b, c);
> + for (i = 0; i < TCACHE_FILL_COUNT; ++i)
> + {
> + free (ps[i]);
> + }
> +
> + /* Free abc will return to fastbin. */
> free (a);
> free (b);
> free (c);
> --
> 2.43.5
>
@@ -3208,6 +3208,18 @@ tcache_next (tcache_entry *e)
return (tcache_entry *) REVEAL_PTR (e->next);
}
+/* Check if tcache is available for alloc by corresponding tc_idx. */
+static __always_inline bool
+tcache_availabe (size_t tc_idx)
+{
+ if (tc_idx < mp_.tcache_bins
+ && tcache != NULL
+ && tcache->counts[tc_idx] > 0)
+ return true;
+ else
+ return false;
+}
+
/* Verify if the suspicious tcache_entry is double free.
It's not expected to execute very often, mark it as noinline. */
static __attribute__ ((noinline)) void
@@ -3366,9 +3378,7 @@ __libc_malloc (size_t bytes)
MAYBE_INIT_TCACHE ();
DIAG_PUSH_NEEDS_COMMENT;
- if (tc_idx < mp_.tcache_bins
- && tcache != NULL
- && tcache->counts[tc_idx] > 0)
+ if (tcache_availabe (tc_idx))
{
victim = tcache_get (tc_idx);
return tag_new_usable (victim);
@@ -3667,9 +3677,7 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
}
size_t tc_idx = csize2tidx (tbytes);
- if (tc_idx < mp_.tcache_bins
- && tcache != NULL
- && tcache->counts[tc_idx] > 0)
+ if (tcache_availabe (tc_idx))
{
/* The tcache itself isn't encoded, but the chain is. */
tcache_entry **tep = & tcache->entries[tc_idx];
@@ -3747,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
return _mid_memalign (pagesize, rounded_bytes, address);
}
+static __always_inline void *
+clear_mem (void *mem, INTERNAL_SIZE_T csz)
+{
+ INTERNAL_SIZE_T *d;
+ unsigned long clearsize, nclears;
+
+ /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
+ contents have an odd number of INTERNAL_SIZE_T-sized words;
+ minimally 3. */
+ d = (INTERNAL_SIZE_T *) mem;
+ clearsize = csz - SIZE_SZ;
+ nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+ assert (nclears >= 3);
+
+ if (nclears > 9)
+ return memset (d, 0, clearsize);
+
+ else
+ {
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ if (nclears > 4)
+ {
+ *(d + 3) = 0;
+ *(d + 4) = 0;
+ if (nclears > 6)
+ {
+ *(d + 5) = 0;
+ *(d + 6) = 0;
+ if (nclears > 8)
+ {
+ *(d + 7) = 0;
+ *(d + 8) = 0;
+ }
+ }
+ }
+ }
+
+ return mem;
+}
+
void *
__libc_calloc (size_t n, size_t elem_size)
{
mstate av;
- mchunkptr oldtop;
- INTERNAL_SIZE_T sz, oldtopsize;
+ mchunkptr oldtop, p;
+ INTERNAL_SIZE_T sz, oldtopsize, csz;
void *mem;
- unsigned long clearsize;
- unsigned long nclears;
- INTERNAL_SIZE_T *d;
ptrdiff_t bytes;
if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3772,6 +3819,27 @@ __libc_calloc (size_t n, size_t elem_size)
MAYBE_INIT_TCACHE ();
+#if USE_TCACHE
+ /* int_free also calls request2size, be careful to not pad twice. */
+ size_t tbytes = checked_request2size (bytes);
+ if (tbytes == 0)
+ {
+ __set_errno (ENOMEM);
+ return NULL;
+ }
+ size_t tc_idx = csize2tidx (tbytes);
+
+ if (tcache_availabe (tc_idx))
+ {
+ mem = tcache_get (tc_idx);
+ p = mem2chunk (mem);
+ if (__glibc_unlikely (mtag_enabled))
+ return tag_new_zero_region (mem, memsize (p));
+ csz = chunksize (p);
+ return clear_mem (mem, csz);
+ }
+#endif
+
if (SINGLE_THREAD_P)
av = &main_arena;
else
@@ -3826,7 +3894,7 @@ __libc_calloc (size_t n, size_t elem_size)
if (mem == NULL)
return NULL;
- mchunkptr p = mem2chunk (mem);
+ p = mem2chunk (mem);
/* If we are using memory tagging, then we need to set the tags
regardless of MORECORE_CLEARS, so we zero the whole block while
@@ -3834,7 +3902,7 @@ __libc_calloc (size_t n, size_t elem_size)
if (__glibc_unlikely (mtag_enabled))
return tag_new_zero_region (mem, memsize (p));
- INTERNAL_SIZE_T csz = chunksize (p);
+ csz = chunksize (p);
/* Two optional cases in which clearing not necessary */
if (chunk_is_mmapped (p))
@@ -3853,40 +3921,7 @@ __libc_calloc (size_t n, size_t elem_size)
}
#endif
- /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
- contents have an odd number of INTERNAL_SIZE_T-sized words;
- minimally 3. */
- d = (INTERNAL_SIZE_T *) mem;
- clearsize = csz - SIZE_SZ;
- nclears = clearsize / sizeof (INTERNAL_SIZE_T);
- assert (nclears >= 3);
-
- if (nclears > 9)
- return memset (d, 0, clearsize);
-
- else
- {
- *(d + 0) = 0;
- *(d + 1) = 0;
- *(d + 2) = 0;
- if (nclears > 4)
- {
- *(d + 3) = 0;
- *(d + 4) = 0;
- if (nclears > 6)
- {
- *(d + 5) = 0;
- *(d + 6) = 0;
- if (nclears > 8)
- {
- *(d + 7) = 0;
- *(d + 8) = 0;
- }
- }
- }
- }
-
- return mem;
+ return clear_mem (mem, csz);
}
#endif /* IS_IN (libc) */
@@ -111,22 +111,37 @@ test_fastbin (void *closure)
int i;
int mask = ((int *)closure)[0];
size_t size = TCACHE_ALLOC_SIZE;
+ void * ps[TCACHE_FILL_COUNT];
+ void * pps[TCACHE_FILL_COUNT];
printf ("++ fastbin ++\n");
+ /* Populate the fastbin list. */
+ void * volatile a = calloc (1, size);
+ void * volatile b = calloc (1, size);
+ void * volatile c = calloc (1, size);
+ printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
+ /* Chunks for later tcache filling from fastbins. */
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ void * volatile p = calloc (1, size);
+ pps[i] = p;
+ }
+
/* Take the tcache out of the game. */
for (i = 0; i < TCACHE_FILL_COUNT; ++i)
{
void * volatile p = calloc (1, size);
- printf ("p=%p\n", p);
- free (p);
+ ps[i] = p;
}
- /* Populate the fastbin list. */
- void * volatile a = calloc (1, size);
- void * volatile b = calloc (1, size);
- void * volatile c = calloc (1, size);
- printf ("a=%p, b=%p, c=%p\n", a, b, c);
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ free (ps[i]);
+ }
+
+ /* Free abc will return to fastbin in FIFO order. */
free (a);
free (b);
free (c);
@@ -136,11 +151,43 @@ test_fastbin (void *closure)
memset (c, mask & 0xFF, size);
printf ("After: c=%p, c[0]=%p\n", c, ((void **)c)[0]);
+ /* Filling fastbins, will be copied to tcache later. */
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ free (pps[i]);
+ }
+
+ /* Drain out tcache to make sure later alloc from fastbins. */
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ void * volatile p = calloc (1, size);
+ ps[i] = p;
+ }
+
+ /* This line will also filling tcache with remain pps and c. */
+ pps[TCACHE_FILL_COUNT - 1] = calloc (1, size);
+
+ /* Tcache is FILO, now the first one is c, take it out. */
c = calloc (1, size);
printf ("Allocated: c=%p\n", c);
+
+ /* Drain out remain pps from tcache. */
+ for (i = 0; i < TCACHE_FILL_COUNT - 1; ++i)
+ {
+ void * volatile p = calloc (1, size);
+ pps[i] = p;
+ }
+
/* This line will trigger the Safe-Linking check. */
b = calloc (1, size);
printf ("b=%p\n", b);
+
+ /* Free previous pointers. */
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ free (ps[i]);
+ free (pps[i]);
+ }
}
/* Try corrupting the fastbin list and trigger a consolidate. */
@@ -150,21 +197,29 @@ test_fastbin_consolidate (void *closure)
int i;
int mask = ((int*)closure)[0];
size_t size = TCACHE_ALLOC_SIZE;
+ void * ps[TCACHE_FILL_COUNT];
printf ("++ fastbin consolidate ++\n");
+ /* Populate the fastbin list. */
+ void * volatile a = calloc (1, size);
+ void * volatile b = calloc (1, size);
+ void * volatile c = calloc (1, size);
+ printf ("a=%p, b=%p, c=%p\n", a, b, c);
+
/* Take the tcache out of the game. */
for (i = 0; i < TCACHE_FILL_COUNT; ++i)
{
void * volatile p = calloc (1, size);
- free (p);
+ ps[i] = p;
}
- /* Populate the fastbin list. */
- void * volatile a = calloc (1, size);
- void * volatile b = calloc (1, size);
- void * volatile c = calloc (1, size);
- printf ("a=%p, b=%p, c=%p\n", a, b, c);
+ for (i = 0; i < TCACHE_FILL_COUNT; ++i)
+ {
+ free (ps[i]);
+ }
+
+ /* Free abc will return to fastbin. */
free (a);
free (b);
free (c);