[updated] malloc per-thread cache ready for review

Message ID xna88tzpuu.fsf@greed.delorie.com
State Committed

Commit Message

DJ Delorie March 10, 2017, 5:02 a.m. UTC
  Branch updated...

    Fix whitespace around operators.
    
    Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
    
    Add helper functions for tcache_get() and tcache_put() to collect
    common code in one spot.

Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

I also put a copy of the oocalc workload online, as a sample of the
workloads I've been using to benchmark.  It's 16M compressed, and you
have to decompress it (to 65M) to use it with the simulator.

  http://www.delorie.com/tmp/oocalc.wl.bz2

(if you use wget, use "-U malloc" ;)
  

Comments

Will Newton March 17, 2017, 10:49 a.m. UTC | #1
On Fri, Mar 10, 2017 at 5:02 AM, DJ Delorie <dj@redhat.com> wrote:
>
> Branch updated...
>
>     Fix whitespace around operators.
>
>     Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
>
>     Add helper functions for tcache_get() and tcache_put() to collect
>     common code in one spot.
>
> Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

This looks OK to me from a basic review: the code looks reasonable, and
there are no testsuite regressions or warnings added.

It could do with a ChangeLog and NEWS entry though.

It would be good to get wider performance testing on the
performance-critical workloads that other people may have.

> I also put a copy of the oocalc workload online, as a sample of the
> workloads I've been using to benchmark.  It's 16M compressed, and you
> have to decompress it (to 65M) to use it with the simulator.
>
>   http://www.delorie.com/tmp/oocalc.wl.bz2
>
> (if you use wget, use "-U malloc" ;)
>
> diff --git a/config.make.in b/config.make.in
> index 5836b32..0290d83 100644
> --- a/config.make.in
> +++ b/config.make.in
> @@ -77,6 +77,8 @@ multi-arch = @multi_arch@
>
>  mach-interface-list = @mach_interface_list@
>
> +experimental-malloc = @experimental_malloc@
> +
>  nss-crypt = @libc_cv_nss_crypt@
>  static-nss-crypt = @libc_cv_static_nss_crypt@
>
> diff --git a/configure.ac b/configure.ac
> index 4a77411..b929012 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
>               [multi_arch=$enableval],
>               [multi_arch=default])
>
> +AC_ARG_ENABLE([experimental-malloc],
> +             AC_HELP_STRING([--disable-experimental-malloc],
> +                            [disable experimental malloc features]),
> +             [experimental_malloc=$enableval],
> +             [experimental_malloc=yes])
> +AC_SUBST(experimental_malloc)
> +
>  AC_ARG_ENABLE([nss-crypt],
>               AC_HELP_STRING([--enable-nss-crypt],
>                              [enable libcrypt to use nss]),
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index cb9e8f1..37620c8 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -76,5 +76,17 @@ glibc {
>        minval: 1
>        security_level: SXID_IGNORE
>      }
> +    tcache_max {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_MAX
> +    }
> +    tcache_count {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_COUNT
> +    }
> +    tcache_unsorted_limit {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
> +    }
>    }
>  }
> diff --git a/malloc/Makefile b/malloc/Makefile
> index e93b83b..dd8a43a 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
>  tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
>  tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
>
> +ifeq ($(experimental-malloc),yes)
> +CPPFLAGS-malloc.c += -DUSE_TCACHE
> +endif
>  # Uncomment this for test releases.  For public releases it is too expensive.
>  #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
>
> diff --git a/malloc/arena.c b/malloc/arena.c
> index d49e4a2..79e918f 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
> +#if USE_TCACHE
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> +#endif
>  #else
>  /* Initialization routine. */
>  #include <string.h>
> @@ -322,6 +327,11 @@ ptmalloc_init (void)
>    TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
> +#if USE_TCACHE
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
> +#endif
>    __libc_lock_unlock (main_arena.mutex);
>  #else
>    const char *s = NULL;
> @@ -371,7 +381,23 @@ ptmalloc_init (void)
>                    if (memcmp (envline, "ARENA_TEST", 10) == 0)
>                      __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
>                  }
> +#if USE_TCACHE
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
> +                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
> +                }
> +#endif
>                break;
> +#if USE_TCACHE
> +            case 12:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
> +                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
> +                }
> +             break;
> +#endif
>              case 15:
>                if (!__builtin_expect (__libc_enable_secure, 0))
>                  {
> @@ -381,6 +407,15 @@ ptmalloc_init (void)
>                      __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
>                  }
>                break;
> +#if USE_TCACHE
> +            case 21:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
> +                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
> +                }
> +             break;
> +#endif
>              default:
>                break;
>              }
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index e29105c..8cd03d8 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
>  }
>  #endif
>
> +#ifndef USE_TCACHE
> +# define USE_TCACHE 0
> +#endif
> +#if USE_TCACHE
> +/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
> +# define MAX_TCACHE_SIZE       (MALLOC_ALIGNMENT * 63)
> +# define TCACHE_IDX            ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
> +# define size2tidx_(bytes)     (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
> +
> +# define tidx2csize(idx)       ((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
> +# define tidx2usize(idx)       ((idx) * MALLOC_ALIGNMENT)
> +
> +/* When "x" is a user-provided size.  */
> +# define usize2tidx(x) size2tidx_ (x)
> +/* When "x" is from chunksize().  */
> +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
> +
> +/* Rounds up, so...
> +   idx 0   bytes 0
> +   idx 1   bytes 1..8
> +   idx 2   bytes 9..16
> +   etc.  */
> +
> +/* This is another arbitrary limit, which tunables can change.  */
> +# define TCACHE_FILL_COUNT 7
> +#endif
> +
>
>  /*
>    REALLOC_ZERO_BYTES_FREES should be set if a call to
> @@ -1711,6 +1738,17 @@ struct malloc_par
>
>    /* First address handed out by MORECORE/sbrk.  */
>    char *sbrk_base;
> +
> +#if USE_TCACHE
> +  /* Maximum number of buckets to use.  */
> +  size_t tcache_max;
> +  size_t tcache_max_bytes;
> +  /* Maximum number of chunks in each bucket.  */
> +  size_t tcache_count;
> +  /* Maximum number of chunks to remove from the unsorted list, which
> +     don't match.  */
> +  size_t tcache_unsorted_limit;
> +#endif
>  };
>
>  /* There are several instances of this struct ("arenas") in this
> @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ =
>    .trim_threshold = DEFAULT_TRIM_THRESHOLD,
>  #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
>    .arena_test = NARENAS_FROM_NCORES (1)
> +#if USE_TCACHE
> +  ,
> +  .tcache_count = TCACHE_FILL_COUNT,
> +  .tcache_max = TCACHE_IDX,
> +  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
> +  .tcache_unsorted_limit = 0 /* No limit */
> +#endif
>  };
>
> +/*  Non public mallopt parameters.  */
> +#if USE_TCACHE
> +# define M_TCACHE_COUNT  -9
> +# define M_TCACHE_MAX  -10
> +# define M_TCACHE_UNSORTED_LIMIT  -11
> +#endif
> +
>  /* Maximum size of memory handled in fastbins.  */
>  static INTERNAL_SIZE_T global_max_fast;
>
> @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size)
>
>  /*------------------------ Public wrappers. --------------------------------*/
>
> +#if USE_TCACHE
> +
> +typedef struct TCacheEntry {
> +  struct TCacheEntry *next;
> +} TCacheEntry;
> +
> +/* There is one of these for each thread, which contains the
> +   per-thread cache (hence "TCache").  Keeping overall size low is
> +   mildly important.  Note that COUNTS and ENTRIES are redundant, this
> +   is for performance reasons.  */
> +typedef struct TCache {
> +  char counts[TCACHE_IDX];
> +  TCacheEntry *entries[TCACHE_IDX];
> +} TCache;
> +
> +static __thread char tcache_shutting_down = 0;
> +static __thread TCache *tcache = NULL;
> +
> +static void
> +tcache_put (mchunkptr chunk, size_t tc_idx)
> +{
> +  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
> +  e->next = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e;
> +  ++(tcache->counts[tc_idx]);
> +}
> +
> +static void *
> +tcache_get (size_t tc_idx)
> +{
> +  TCacheEntry *e = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e->next;
> +  --(tcache->counts[tc_idx]);
> +  return (void *) e;
> +}
> +
> +static void __attribute__ ((section ("__libc_thread_freeres_fn")))
> +tcache_thread_freeres (void)
> +{
> +  int i;
> +  TCache *tcache_tmp = tcache;
> +
> +  if (!tcache)
> +    return;
> +
> +  tcache = NULL;
> +
> +  for (i = 0; i < TCACHE_IDX; ++i) {
> +    while (tcache_tmp->entries[i])
> +      {
> +       TCacheEntry *e = tcache_tmp->entries[i];
> +       tcache_tmp->entries[i] = e->next;
> +       __libc_free (e);
> +      }
> +  }
> +
> +  __libc_free (tcache_tmp);
> +
> +  tcache_shutting_down = 1;
> +}
> +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
> +
> +static void
> +tcache_init(void)
> +{
> +  mstate ar_ptr;
> +  void *victim = 0;
> +  const size_t bytes = sizeof (TCache);
> +
> +  if (tcache_shutting_down)
> +    return;
> +
> +  arena_get (ar_ptr, bytes);
> +  victim = _int_malloc (ar_ptr, bytes);
> +  if (!victim && ar_ptr != NULL)
> +    {
> +      ar_ptr = arena_get_retry (ar_ptr, bytes);
> +      victim = _int_malloc (ar_ptr, bytes);
> +    }
> +
> +
> +  if (ar_ptr != NULL)
> +    __libc_lock_unlock (ar_ptr->mutex);
> +
> +  if (victim)
> +    {
> +      tcache = (TCache *) victim;
> +      memset (tcache, 0, sizeof (TCache));
> +    }
> +
> +}
> +
> +#define MAYBE_INIT_TCACHE() \
> +  if (__glibc_unlikely (tcache == NULL)) \
> +    tcache_init();
> +
> +#else
> +#define MAYBE_INIT_TCACHE()
> +#endif
> +
>  void *
>  __libc_malloc (size_t bytes)
>  {
> @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes)
>      = atomic_forced_read (__malloc_hook);
>    if (__builtin_expect (hook != NULL, 0))
>      return (*hook)(bytes, RETURN_ADDRESS (0));
> +#if USE_TCACHE
> +  /* int_free also calls request2size, be careful to not pad twice.  */
> +  size_t tbytes = request2size (bytes);
> +  size_t tc_idx = csize2tidx (tbytes);
> +
> +  MAYBE_INIT_TCACHE ();
> +
> +  if (tc_idx < mp_.tcache_max
> +      && tc_idx < TCACHE_IDX /* to appease gcc */
> +      && tcache
> +      && tcache->entries[tc_idx] != NULL)
> +    {
> +      return tcache_get (tc_idx);
> +    }
> +#endif
>
>    arena_get (ar_ptr, bytes);
>
> @@ -2943,6 +3110,8 @@ __libc_free (void *mem)
>        return;
>      }
>
> +  MAYBE_INIT_TCACHE ();
> +
>    ar_ptr = arena_for_chunk (p);
>    _int_free (ar_ptr, p, 0);
>  }
> @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes)
>    if (chunk_is_mmapped (oldp))
>      ar_ptr = NULL;
>    else
> -    ar_ptr = arena_for_chunk (oldp);
> +    {
> +      MAYBE_INIT_TCACHE ();
> +      ar_ptr = arena_for_chunk (oldp);
> +    }
>
>    /* Little security check which won't hurt performance: the allocator
>       never wrapps around at the end of the address space.  Therefore
> @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size)
>
>    sz = bytes;
>
> +  MAYBE_INIT_TCACHE ();
> +
>    arena_get (av, sz);
>    if (av)
>      {
> @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes)
>    mchunkptr fwd;                    /* misc temp for linking */
>    mchunkptr bck;                    /* misc temp for linking */
>
> +#if USE_TCACHE
> +  size_t tcache_unsorted_count;            /* count of unsorted chunks processed */
> +#endif
> +
>    const char *errstr = NULL;
>
>    /*
> @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes)
>                return NULL;
>              }
>            check_remalloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +         /* While we're here, if we see other chunks of the same size,
> +            stash them in the tcache.  */
> +         size_t tc_idx = csize2tidx (nb);
> +         if (tcache && tc_idx < mp_.tcache_max)
> +           {
> +             mchunkptr tc_victim;
> +             int found = 0;
> +
> +             /* While bin not empty and tcache not full, copy chunks over.  */
> +             while (tcache->counts[tc_idx] < mp_.tcache_count
> +                    && (pp = *fb) != NULL)
> +               {
> +                 do
> +                   {
> +                     tc_victim = pp;
> +                     if (tc_victim == NULL)
> +                       break;
> +                   }
> +                 while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
> +                        != tc_victim);
> +                 if (tc_victim != 0)
> +                   {
> +                     tcache_put (tc_victim, tc_idx);
> +                     ++found;
> +                   }
> +               }
> +           }
> +#endif
>            void *p = chunk2mem (victim);
>            alloc_perturb (p, bytes);
>            return p;
> @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes)
>                if (av != &main_arena)
>                 set_non_main_arena (victim);
>                check_malloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +         /* While we're here, if we see other chunks of the same size,
> +            stash them in the tcache.  */
> +         size_t tc_idx = csize2tidx (nb);
> +         if (tcache && tc_idx < mp_.tcache_max)
> +           {
> +             mchunkptr tc_victim;
> +             int found = 0;
> +
> +             /* While bin not empty and tcache not full, copy chunks over.  */
> +             while (tcache->counts[tc_idx] < mp_.tcache_count
> +                    && (tc_victim = last (bin)) != bin)
> +               {
> +                 if (tc_victim != 0)
> +                   {
> +                     bck = tc_victim->bk;
> +                     set_inuse_bit_at_offset (tc_victim, nb);
> +                     if (av != &main_arena)
> +                       set_non_main_arena (tc_victim);
> +                     bin->bk = bck;
> +                     bck->fd = bin;
> +
> +                     tcache_put (tc_victim, tc_idx);
> +                     ++found;
> +                   }
> +               }
> +           }
> +#endif
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes)
>       otherwise need to expand memory to service a "small" request.
>     */
>
> +#if USE_TCACHE
> +  INTERNAL_SIZE_T tcache_nb = 0;
> +  size_t tc_idx = csize2tidx (nb);
> +  if (tcache && tc_idx < mp_.tcache_max)
> +    tcache_nb = nb;
> +  int return_cached = 0;
> +
> +  tcache_unsorted_count = 0;
> +#endif
> +
>    for (;; )
>      {
>        int iters = 0;
> @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes)
>                set_inuse_bit_at_offset (victim, size);
>                if (av != &main_arena)
>                 set_non_main_arena (victim);
> +#if USE_TCACHE
> +             /* Fill cache first, return to user only if cache fills.
> +                We may return one of these chunks later.  */
> +             if (tcache_nb
> +                 && tcache->counts[tc_idx] < mp_.tcache_count)
> +               {
> +                 tcache_put (victim, tc_idx);
> +                 return_cached = 1;
> +                 continue;
> +               }
> +             else
> +               {
> +#endif
>                check_malloced_chunk (av, victim, nb);
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> +#if USE_TCACHE
> +               }
> +#endif
>              }
>
>            /* place chunk in bin */
> @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes)
>            fwd->bk = victim;
>            bck->fd = victim;
>
> +#if USE_TCACHE
> +      /* If we've processed as many chunks as we're allowed while
> +        filling the cache, return one of the cached ones.  */
> +      ++tcache_unsorted_count;
> +      if (return_cached
> +         && mp_.tcache_unsorted_limit > 0
> +         && tcache_unsorted_count > mp_.tcache_unsorted_limit)
> +       {
> +         return tcache_get (tc_idx);
> +       }
> +#endif
> +
>  #define MAX_ITERS       10000
>            if (++iters >= MAX_ITERS)
>              break;
>          }
>
> +#if USE_TCACHE
> +      /* If all the small chunks we found ended up cached, return one now.  */
> +      if (return_cached)
> +       {
> +         return tcache_get (tc_idx);
> +       }
> +#endif
> +
>        /*
>           If a large request, scan through the chunks of current bin in
>           sorted order to find smallest that fits.  Use the skip list for this.
> @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>
>    check_inuse_chunk(av, p);
>
> +#if USE_TCACHE
> +  {
> +    size_t tc_idx = csize2tidx (size);
> +
> +    if (tcache
> +       && tc_idx < mp_.tcache_max
> +       && tcache->counts[tc_idx] < mp_.tcache_count)
> +      {
> +       tcache_put (p, tc_idx);
> +       return;
> +      }
> +  }
> +#endif
> +
>    /*
>      If eligible, place chunk on a fastbin so it can be found
>      and used quickly in malloc.
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value)
>    return 1;
>  }
>
> +#if USE_TCACHE
> +static inline int
> +__always_inline
> +do_set_tcache_max (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
> +  if (value >= 0 && value <= MAX_TCACHE_SIZE)
> +    {
> +      mp_.tcache_max_bytes = value;
> +      mp_.tcache_max = usize2tidx (value) + 1;
> +    }
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_count (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
> +  mp_.tcache_count = value;
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_unsorted_limit (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
> +  mp_.tcache_unsorted_limit = value;
> +  return 1;
> +}
> +#endif
>
>  int
>  __libc_mallopt (int param_number, int value)
> @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value)
>        if (value > 0)
>         do_set_arena_test (value);
>        break;
> +#if USE_TCACHE
> +    case M_TCACHE_COUNT:
> +      if (value >= 0)
> +       do_set_tcache_count (value);
> +      break;
> +    case M_TCACHE_MAX:
> +      if (value >= 0)
> +       do_set_tcache_max (value);
> +      break;
> +    case M_TCACHE_UNSORTED_LIMIT:
> +      if (value >= 0)
> +       do_set_tcache_unsorted_limit (value);
> +      break;
> +#endif
>      }
>    __libc_lock_unlock (av->mutex);
>    return res;
  
Carlos O'Donell May 8, 2017, 6:39 a.m. UTC | #2
On 03/10/2017 12:02 AM, DJ Delorie wrote:
> Branch updated...
> 
>     Fix whitespace around operators.
>     
>     Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
>     
>     Add helper functions for tcache_get() and tcache_put() to collect
>     common code in one spot.
> 
> Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

I'm very excited to see this go upstream. Overall this is looking better
and almost there. I think one more version incorporating these comments and
we can consider this ready. Please review the comments, integrate them,
produce a v3, and repost it. Please put me on TO: so I can review again.

> I also put a copy of the oocalc workload online, as a sample of the
> workloads I've been using to benchmark.  It's 16M compressed, and you
> have to decompress it (to 65M) to use it with the simulator.
> 
>   http://www.delorie.com/tmp/oocalc.wl.bz2
> 
> (if you use wget, use "-U malloc" ;)

High-level review:

- Adding a thread local cache of chunks for better performance. OK.

  The reasons for tcache make sense, and the current workload benchmarks
  make sense to me. On average, having a per-thread pool of chunks to
  fetch quickly makes for better average performance across the workloads
  you have looked at. If people claim their workloads are impacted then
  the trace infrastructure can be used to capture and reproduce that workload
  to see what is going on.

Design Review:

- Adding legacy environment variables. Not OK.

  We have the tunables framework now, so we don't need legacy environment
  variables. Please remove them and resubmit.
  The same applies to the private mallopt interface you were using.

- Missing manual documentation for new tcache tunables. Not OK.

  Please add the tcache tunables to the manual in the tunables section and
  describe in detail what they do. I don't know whether we can easily have a
  conditional section in the manual; I don't really care though, just describe
  the entries and they'll get ignored if tcache isn't enabled anyway.

- Missing manual documentation for new probes. Not OK.

  I applaud the addition of new useful probes in the malloc subsystem.
  Please document the new probes and their interface in probes.texi.

- Tunables security level?

  The malloc tunables should all be 'security_level: SXID_IGNORE'.

- Code duplication?

  In _int_malloc we have some code duplication where tcache and the normal
  code both do the same thing e.g. remove a block. Can we avoid some of
  this duplication with a first refactoring patch?


Low-level review:

Follows...  

> diff --git a/config.make.in b/config.make.in
> index 5836b32..0290d83 100644
> --- a/config.make.in
> +++ b/config.make.in
> @@ -77,6 +77,8 @@ multi-arch = @multi_arch@
>  
>  mach-interface-list = @mach_interface_list@
>  
> +experimental-malloc = @experimental_malloc@
> +

OK.

>  nss-crypt = @libc_cv_nss_crypt@
>  static-nss-crypt = @libc_cv_static_nss_crypt@
>  
> diff --git a/configure.ac b/configure.ac
> index 4a77411..b929012 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
>  	      [multi_arch=$enableval],
>  	      [multi_arch=default])
>  
> +AC_ARG_ENABLE([experimental-malloc],
> +	      AC_HELP_STRING([--disable-experimental-malloc],
> +			     [disable experimental malloc features]),
> +	      [experimental_malloc=$enableval],
> +	      [experimental_malloc=yes])
> +AC_SUBST(experimental_malloc)

OK. Enabled by default is good IMO.

> +
>  AC_ARG_ENABLE([nss-crypt],
>  	      AC_HELP_STRING([--enable-nss-crypt],
>  			     [enable libcrypt to use nss]),
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index cb9e8f1..37620c8 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -76,5 +76,17 @@ glibc {
>        minval: 1
>        security_level: SXID_IGNORE
>      }
> +    tcache_max {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_MAX
> +    }
> +    tcache_count {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_COUNT
> +    }
> +    tcache_unsorted_limit {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT

Noted already that legacy env must be removed.

Noted already that manual needs updating to describe these.

Noted already these should be 'security_level: SXID_IGNORE' to avoid any
impact on secure processes.

Otherwise OK.
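
E.g. once the env aliases are gone, the entries could look something like
this (just a sketch, the exact details are up to you):

  tcache_max {
    type: SIZE_T
    security_level: SXID_IGNORE
  }
  tcache_count {
    type: SIZE_T
    security_level: SXID_IGNORE
  }
  tcache_unsorted_limit {
    type: SIZE_T
    security_level: SXID_IGNORE
  }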

> +    }
>    }
>  }
> diff --git a/malloc/Makefile b/malloc/Makefile
> index e93b83b..dd8a43a 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
>  tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
>  tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
>  
> +ifeq ($(experimental-malloc),yes)
> +CPPFLAGS-malloc.c += -DUSE_TCACHE
> +endif

Must be -DUSE_TCACHE=1, please follow -Wundef rules:

https://sourceware.org/glibc/wiki/Wundef

Needs an else conditional to set -DUSE_TCACHE=0.
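
E.g. something like:

  ifeq ($(experimental-malloc),yes)
  CPPFLAGS-malloc.c += -DUSE_TCACHE=1
  else
  CPPFLAGS-malloc.c += -DUSE_TCACHE=0
  endif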

>  # Uncomment this for test releases.  For public releases it is too expensive.
>  #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
>  
> diff --git a/malloc/arena.c b/malloc/arena.c
> index d49e4a2..79e918f 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
> +#if USE_TCACHE
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> +#endif

OK.

>  #else
>  /* Initialization routine. */
>  #include <string.h>
> @@ -322,6 +327,11 @@ ptmalloc_init (void)
>    TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
> +#if USE_TCACHE
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);

Overlong line. Please wrap.

e.g.

  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL,
				 set_tcache_unsorted_limit);

> +#endif

OK.

>    __libc_lock_unlock (main_arena.mutex);
>  #else
>    const char *s = NULL;
> @@ -371,7 +381,23 @@ ptmalloc_init (void)
>                    if (memcmp (envline, "ARENA_TEST", 10) == 0)
>                      __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
>                  }
> +#if USE_TCACHE
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
> +                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
> +                }
> +#endif

Remove.

>                break;
> +#if USE_TCACHE
> +            case 12:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
> +                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
> +                }
> +	      break;
> +#endif

Remove.

>              case 15:
>                if (!__builtin_expect (__libc_enable_secure, 0))
>                  {
> @@ -381,6 +407,15 @@ ptmalloc_init (void)
>                      __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
>                  }
>                break;
> +#if USE_TCACHE
> +            case 21:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
> +                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
> +                }
> +	      break;
> +#endif

Remove.

>              default:
>                break;
>              }
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index e29105c..8cd03d8 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
>  }
>  #endif
>  
> +#ifndef USE_TCACHE
> +# define USE_TCACHE 0
> +#endif

Remove; it's not needed because you set it to 0 or 1 in the makefile.
That avoids the macro API pitfall of an accidentally undefined USE_TCACHE.

> +#if USE_TCACHE
> +/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
> +# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)

Why use 63? Is it because index 0 is reserved for the empty zero-byte size? If so,
then just say "We want 63 non-zero sized entries."?

> +# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)

Since this is the maximum index, can we call this TCACHE_MAX_IDX?

> +# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)

Why the trailing underscore?

> +
> +# define tidx2csize(idx)	((idx) * MALLOC_ALIGNMENT + SIZE_SZ)

Unused, but OK from a symmetry perspective for debug uses.

> +# define tidx2usize(idx)	((idx) * MALLOC_ALIGNMENT)
> +
> +/* When "x" is a user-provided size.  */
> +# define usize2tidx(x) size2tidx_ (x)
> +/* When "x" is from chunksize().  */
> +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
> +
> +/* Rounds up, so...
> +   idx 0   bytes 0
> +   idx 1   bytes 1..8
> +   idx 2   bytes 9..16
> +   etc.  */

This doesn't look correct for a 64-bit x86_64 system (maybe 32-bit?).

Could you please adjust the comment for 64-bit x86_64, where MALLOC_ALIGNMENT
should be 16 bytes, and indicate which machine the comment is for.
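
E.g. with MALLOC_ALIGNMENT == 16 (64-bit x86_64), the usize2tidx mapping
rounds up as:

  idx 0   bytes 0
  idx 1   bytes 1..16
  idx 2   bytes 17..32
  etc.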

> +
> +/* This is another arbitrary limit, which tunables can change.  */
> +# define TCACHE_FILL_COUNT 7

The comment should describe in more detail what a developer needs to
know about this.

> +#endif
> +
>  
>  /*
>    REALLOC_ZERO_BYTES_FREES should be set if a call to
> @@ -1711,6 +1738,17 @@ struct malloc_par
>  
>    /* First address handed out by MORECORE/sbrk.  */
>    char *sbrk_base;
> +
> +#if USE_TCACHE
> +  /* Maximum number of buckets to use.  */
> +  size_t tcache_max;

Can we call this tcache_max_idx to match the macro constant?

> +  size_t tcache_max_bytes;
> +  /* Maximum number of chunks in each bucket.  */
> +  size_t tcache_count;
> +  /* Maximum number of chunks to remove from the unsorted list, which
> +     don't match.  */

Which don't match what?

> +  size_t tcache_unsorted_limit;
> +#endif
>  };
>  
>  /* There are several instances of this struct ("arenas") in this
> @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ =
>    .trim_threshold = DEFAULT_TRIM_THRESHOLD,
>  #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
>    .arena_test = NARENAS_FROM_NCORES (1)
> +#if USE_TCACHE
> +  ,
> +  .tcache_count = TCACHE_FILL_COUNT,
> +  .tcache_max = TCACHE_IDX,

s/.tcache_max = TCACHE_IDX/.tcache_max_idx = TCACHE_MAX_IDX/g

> +  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),

s/TCACHE_IDX-1/TCACHE_MAX_IDX - 1/g

> +  .tcache_unsorted_limit = 0 /* No limit */

s,/* No limit */,/* No limit.  */,

> +#endif
>  };
>  
> +/*  Non public mallopt parameters.  */
> +#if USE_TCACHE
> +# define M_TCACHE_COUNT  -9
> +# define M_TCACHE_MAX  -10
> +# define M_TCACHE_UNSORTED_LIMIT  -11

Remove. Not needed anymore.

> +#endif
> +
>  /* Maximum size of memory handled in fastbins.  */
>  static INTERNAL_SIZE_T global_max_fast;
>  
> @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size)
>  
>  /*------------------------ Public wrappers. --------------------------------*/
>  
> +#if USE_TCACHE
> +
> +typedef struct TCacheEntry {
> +  struct TCacheEntry *next;
> +} TCacheEntry;

This needs a comment explaining in detail that these apparently incomplete
types are layered on top of a chunk.

Otherwise OK.

> +
> +/* There is one of these for each thread, which contains the
> +   per-thread cache (hence "TCache").  Keeping overall size low is
> +   mildly important.  Note that COUNTS and ENTRIES are redundant, this
> +   is for performance reasons.  */

Why do you say they are redundant? Because you could in theory have a thread
local pointer to TCacheEntry and walk that list when a malloc request arrives?
If so, please expand the comment to cover that.

> +typedef struct TCache {
> +  char counts[TCACHE_IDX];
> +  TCacheEntry *entries[TCACHE_IDX];
> +} TCache;

OK.

> +
> +static __thread char tcache_shutting_down = 0;

OK.

> +static __thread TCache *tcache = NULL;
> +
> +static void
> +tcache_put (mchunkptr chunk, size_t tc_idx)
> +{
> +  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);

Can we assert the safety of tc_idx? assert (tc_idx < TCACHE_IDX); ?

> +  e->next = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e;
> +  ++(tcache->counts[tc_idx]);

What prevents count overflow?

> +}
> +
> +static void *
> +tcache_get (size_t tc_idx)
> +{

Can we assert the safety of tc_idx?

> +  TCacheEntry *e = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e->next;
> +  --(tcache->counts[tc_idx]);

What prevents count underflow?

> +  return (void *) e;
> +}
> +
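
To make both points above explicit (the tc_idx bounds and the count
overflow/underflow invariants), something like this would do.  Just a
sketch of what I mean, not a demand for this exact code:

  static void
  tcache_put (mchunkptr chunk, size_t tc_idx)
  {
    TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
    /* Callers must have checked both of these already.  */
    assert (tc_idx < TCACHE_IDX);
    assert (tcache->counts[tc_idx] < mp_.tcache_count);
    e->next = tcache->entries[tc_idx];
    tcache->entries[tc_idx] = e;
    ++(tcache->counts[tc_idx]);
  }

  static void *
  tcache_get (size_t tc_idx)
  {
    /* Callers only call this when the bin is known to be non-empty.  */
    assert (tc_idx < TCACHE_IDX);
    assert (tcache->entries[tc_idx] != NULL);
    TCacheEntry *e = tcache->entries[tc_idx];
    tcache->entries[tc_idx] = e->next;
    --(tcache->counts[tc_idx]);
    return (void *) e;
  }
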
> +static void __attribute__ ((section ("__libc_thread_freeres_fn")))
> +tcache_thread_freeres (void)
> +{
> +  int i;
> +  TCache *tcache_tmp = tcache;
> +
> +  if (!tcache)
> +    return;
> +
> +  tcache = NULL;
> +
> +  for (i = 0; i < TCACHE_IDX; ++i) {
> +    while (tcache_tmp->entries[i])
> +      {
> +	TCacheEntry *e = tcache_tmp->entries[i];
> +	tcache_tmp->entries[i] = e->next;
> +	__libc_free (e);
> +      }
> +  }
> +
> +  __libc_free (tcache_tmp);
> +
> +  tcache_shutting_down = 1;
> +}

OK.

> +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
> +
> +static void
> +tcache_init(void)
> +{
> +  mstate ar_ptr;
> +  void *victim = 0;
> +  const size_t bytes = sizeof (TCache);
> +
> +  if (tcache_shutting_down)
> +    return;
> +
> +  arena_get (ar_ptr, bytes);
> +  victim = _int_malloc (ar_ptr, bytes);
> +  if (!victim && ar_ptr != NULL)
> +    {
> +      ar_ptr = arena_get_retry (ar_ptr, bytes);
> +      victim = _int_malloc (ar_ptr, bytes);
> +    }
> +
> +
> +  if (ar_ptr != NULL)
> +    __libc_lock_unlock (ar_ptr->mutex);

OK, unlock because arena_get_retry returns a locked arena.

> +
> +  if (victim)
> +    {
> +      tcache = (TCache *) victim;
> +      memset (tcache, 0, sizeof (TCache));
> +    }
> +

Please add a comment here noting that in a low-memory condition we will have
victim == NULL and tcache == NULL, and that we will keep calling tcache_init
at every opportunity until we eventually get enough memory for the tcache.
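
Something along the lines of:

  /* In a low-memory condition, victim (and therefore tcache) may still
     be NULL here; in that case MAYBE_INIT_TCACHE will simply try again
     on the next call into malloc.  */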

> +}
> +
> +#define MAYBE_INIT_TCACHE() \
> +  if (__glibc_unlikely (tcache == NULL)) \
> +    tcache_init();
> +
> +#else
> +#define MAYBE_INIT_TCACHE()
> +#endif
> +

OK.

>  void *
>  __libc_malloc (size_t bytes)
>  {
> @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes)
>      = atomic_forced_read (__malloc_hook);
>    if (__builtin_expect (hook != NULL, 0))
>      return (*hook)(bytes, RETURN_ADDRESS (0));
> +#if USE_TCACHE
> +  /* int_free also calls request2size, be careful to not pad twice.  */
> +  size_t tbytes = request2size (bytes);
> +  size_t tc_idx = csize2tidx (tbytes);
> +
> +  MAYBE_INIT_TCACHE ();
> +
> +  if (tc_idx < mp_.tcache_max
> +      && tc_idx < TCACHE_IDX /* to appease gcc */
> +      && tcache
> +      && tcache->entries[tc_idx] != NULL)
> +    {
> +      return tcache_get (tc_idx);
> +    }
> +#endif

OK.

>  
>    arena_get (ar_ptr, bytes);
>  
> @@ -2943,6 +3110,8 @@ __libc_free (void *mem)
>        return;
>      }
>  
> +  MAYBE_INIT_TCACHE ();

OK.

> +
>    ar_ptr = arena_for_chunk (p);
>    _int_free (ar_ptr, p, 0);
>  }
> @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes)
>    if (chunk_is_mmapped (oldp))
>      ar_ptr = NULL;
>    else
> -    ar_ptr = arena_for_chunk (oldp);
> +    {
> +      MAYBE_INIT_TCACHE ();
> +      ar_ptr = arena_for_chunk (oldp);

OK.

> +    }
>  
>    /* Little security check which won't hurt performance: the allocator
>       never wrapps around at the end of the address space.  Therefore
> @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size)
>  
>    sz = bytes;
>  
> +  MAYBE_INIT_TCACHE ();
> +

OK.

>    arena_get (av, sz);
>    if (av)
>      {
> @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes)
>    mchunkptr fwd;                    /* misc temp for linking */
>    mchunkptr bck;                    /* misc temp for linking */
>  
> +#if USE_TCACHE
> +  size_t tcache_unsorted_count;	    /* count of unsorted chunks processed */
> +#endif

OK.

> +
>    const char *errstr = NULL;
>  
>    /*
> @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes)
>                return NULL;
>              }
>            check_remalloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +	  /* While we're here, if we see other chunks of the same size,
> +	     stash them in the tcache.  */
> +	  size_t tc_idx = csize2tidx (nb);
> +	  if (tcache && tc_idx < mp_.tcache_max)
> +	    {
> +	      mchunkptr tc_victim;
> +	      int found = 0;

Unused 'found'

> +
> +	      /* While bin not empty and tcache not full, copy chunks over.  */
> +	      while (tcache->counts[tc_idx] < mp_.tcache_count
> +		     && (pp = *fb) != NULL)

OK, start at the fastbin associated with the allocation request size.

> +		{
> +		  do
> +		    {
> +	              tc_victim = pp;
> +	              if (tc_victim == NULL)
> +	                break;
> +	            }
> +	          while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
> +	                 != tc_victim);

OK, try to remove the chunk from the fastbin ...

> +		  if (tc_victim != 0)
> +		    {
> +		      tcache_put (tc_victim, tc_idx);

... and if you succeed then put it in the tcache.

> +		      ++found;

Remove (unused).

> +	            }
> +		}
> +	    }
> +#endif

This block of code is very similar to the fastbin block just above it.
Both blocks of code try to take a chunk out of a similar list. Is there any
way we can consolidate this? Perhaps post a first patch which factors out
the "get a chunk from the fastbin list" step, and then the tcache code here
would be smaller.
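
For instance (the name is made up, just to show the shape of the
refactoring), a small helper could serve both the existing fastbin code
and the new tcache loop:

  /* Atomically pop the first chunk off fastbin *FB, or return NULL if
     the bin is empty.  */
  static __always_inline mchunkptr
  fastbin_pop (mfastbinptr *fb)
  {
    mchunkptr pp = *fb;
    mchunkptr victim;

    do
      {
        victim = pp;
        if (victim == NULL)
          return NULL;
      }
    while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
           != victim);
    return victim;
  }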

>            void *p = chunk2mem (victim);
>            alloc_perturb (p, bytes);
>            return p;
> @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes)
>                if (av != &main_arena)
>  		set_non_main_arena (victim);
>                check_malloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +	  /* While we're here, if we see other chunks of the same size,
> +	     stash them in the tcache.  */

Same interaction as above but for non-fast bins.

> +	  size_t tc_idx = csize2tidx (nb);
> +	  if (tcache && tc_idx < mp_.tcache_max)
> +	    {
> +	      mchunkptr tc_victim;
> +	      int found = 0;

Unused.

> +
> +	      /* While bin not empty and tcache not full, copy chunks over.  */
> +	      while (tcache->counts[tc_idx] < mp_.tcache_count
> +		     && (tc_victim = last (bin)) != bin)
> +		{
> +		  if (tc_victim != 0)
> +		    {
> +		      bck = tc_victim->bk;
> +		      set_inuse_bit_at_offset (tc_victim, nb);
> +		      if (av != &main_arena)
> +			set_non_main_arena (tc_victim);
> +		      bin->bk = bck;
> +		      bck->fd = bin;

OK.

> +
> +		      tcache_put (tc_victim, tc_idx);
> +		      ++found;

Remove (unused).

> +	            }
> +		}
> +	    }
> +#endif
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes)
>       otherwise need to expand memory to service a "small" request.
>     */
>  
> +#if USE_TCACHE
> +  INTERNAL_SIZE_T tcache_nb = 0;
> +  size_t tc_idx = csize2tidx (nb);
> +  if (tcache && tc_idx < mp_.tcache_max)
> +    tcache_nb = nb;
> +  int return_cached = 0;
> +
> +  tcache_unsorted_count = 0;

We selectively apply this limit to just the unsorted list, but why not to all
the other places where we iterate while filling the cache? Because in the
other cases we have perfect size matches that we know we want?

> +#endif

OK.

> +
>    for (;; )
>      {
>        int iters = 0;
> @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes)
>                set_inuse_bit_at_offset (victim, size);
>                if (av != &main_arena)
>  		set_non_main_arena (victim);
> +#if USE_TCACHE
> +	      /* Fill cache first, return to user only if cache fills.
> +		 We may return one of these chunks later.  */
> +	      if (tcache_nb
> +		  && tcache->counts[tc_idx] < mp_.tcache_count)
> +		{
> +		  tcache_put (victim, tc_idx);
> +		  return_cached = 1;
> +		  continue;
> +		}
> +	      else
> +		{
> +#endif

OK.

>                check_malloced_chunk (av, victim, nb);
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> +#if USE_TCACHE
> +		}
> +#endif
>              }
>  
>            /* place chunk in bin */
> @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes)
>            fwd->bk = victim;
>            bck->fd = victim;
>  
> +#if USE_TCACHE
> +      /* If we've processed as many chunks as we're allowed while
> +	 filling the cache, return one of the cached ones.  */
> +      ++tcache_unsorted_count;
> +      if (return_cached
> +	  && mp_.tcache_unsorted_limit > 0
> +	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
> +	{
> +	  return tcache_get (tc_idx);

OK.

> +	}
> +#endif
> +
>  #define MAX_ITERS       10000
>            if (++iters >= MAX_ITERS)
>              break;
>          }
>  
> +#if USE_TCACHE
> +      /* If all the small chunks we found ended up cached, return one now.  */
> +      if (return_cached)
> +	{
> +	  return tcache_get (tc_idx);

OK.

> +	}
> +#endif
> +
>        /*
>           If a large request, scan through the chunks of current bin in
>           sorted order to find smallest that fits.  Use the skip list for this.
> @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>  
>    check_inuse_chunk(av, p);
>  
> +#if USE_TCACHE
> +  {
> +    size_t tc_idx = csize2tidx (size);
> +
> +    if (tcache
> +	&& tc_idx < mp_.tcache_max
> +	&& tcache->counts[tc_idx] < mp_.tcache_count)
> +      {
> +	tcache_put (p, tc_idx);
> +	return;

OK.

> +      }
> +  }
> +#endif
> +
>    /*
>      If eligible, place chunk on a fastbin so it can be found
>      and used quickly in malloc.
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value)
>    return 1;
>  }
>  
> +#if USE_TCACHE
> +static inline int
> +__always_inline
> +do_set_tcache_max (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);

s/memory_mallopt_tcache_max_bytes/memory_set_tcache_max_bytes/g

> +  if (value >= 0 && value <= MAX_TCACHE_SIZE)
> +    {
> +      mp_.tcache_max_bytes = value;
> +      mp_.tcache_max = usize2tidx (value) + 1;
> +    }
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_count (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);

Likewise.

> +  mp_.tcache_count = value;
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_unsorted_limit (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);

Likewise.

> +  mp_.tcache_unsorted_limit = value;
> +  return 1;
> +}
> +#endif
>  
>  int
>  __libc_mallopt (int param_number, int value)
> @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value)
>        if (value > 0)
>  	do_set_arena_test (value);
>        break;
> +#if USE_TCACHE
> +    case M_TCACHE_COUNT:
> +      if (value >= 0)
> +	do_set_tcache_count (value);
> +      break;
> +    case M_TCACHE_MAX:
> +      if (value >= 0)
> +	do_set_tcache_max (value);
> +      break;
> +    case M_TCACHE_UNSORTED_LIMIT:
> +      if (value >= 0)
> +	do_set_tcache_unsorted_limit (value);
> +      break;
> +#endif

Remove.

>      }
>    __libc_lock_unlock (av->mutex);
>    return res;
>
  
Florian Weimer May 8, 2017, 8:11 a.m. UTC | #3
On 03/10/2017 06:02 AM, DJ Delorie wrote:

> +typedef struct TCacheEntry {
> +  struct TCacheEntry *next;
> +} TCacheEntry;
> +
> +/* There is one of these for each thread, which contains the
> +   per-thread cache (hence "TCache").  Keeping overall size low is
> +   mildly important.  Note that COUNTS and ENTRIES are redundant, this
> +   is for performance reasons.  */
> +typedef struct TCache {
> +  char counts[TCACHE_IDX];
> +  TCacheEntry *entries[TCACHE_IDX];
> +} TCache;

These type names remind me of Turbo Vision.  I don't think we use the 
CamelCase convention in glibc. :)

I would like to see some discussion of the per-thread overhead the
thread cache introduces, maybe alongside the documentation of the tunables:
both the static overhead (due to the new per-thread state) and some bounds
on the additional per-thread allocation retention.
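
(As a rough back-of-the-envelope with the current defaults on a 64-bit
target where MALLOC_ALIGNMENT is 16: the static part is sizeof (TCache),
i.e. 64 one-byte counts plus 64 pointers, about 576 bytes per thread;
retention is bounded by tcache_count (7) chunks in each of the 64 bins,
which works out to a couple of hundred KiB per thread in the worst case.)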

Thanks,
Florian
  

Patch

diff --git a/config.make.in b/config.make.in
index 5836b32..0290d83 100644
--- a/config.make.in
+++ b/config.make.in
@@ -77,6 +77,8 @@  multi-arch = @multi_arch@
 
 mach-interface-list = @mach_interface_list@
 
+experimental-malloc = @experimental_malloc@
+
 nss-crypt = @libc_cv_nss_crypt@
 static-nss-crypt = @libc_cv_static_nss_crypt@
 
diff --git a/configure.ac b/configure.ac
index 4a77411..b929012 100644
--- a/configure.ac
+++ b/configure.ac
@@ -313,6 +313,13 @@  AC_ARG_ENABLE([multi-arch],
 	      [multi_arch=$enableval],
 	      [multi_arch=default])
 
+AC_ARG_ENABLE([experimental-malloc],
+	      AC_HELP_STRING([--disable-experimental-malloc],
+			     [disable experimental malloc features]),
+	      [experimental_malloc=$enableval],
+	      [experimental_malloc=yes])
+AC_SUBST(experimental_malloc)
+
 AC_ARG_ENABLE([nss-crypt],
 	      AC_HELP_STRING([--enable-nss-crypt],
 			     [enable libcrypt to use nss]),
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index cb9e8f1..37620c8 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -76,5 +76,17 @@  glibc {
       minval: 1
       security_level: SXID_IGNORE
     }
+    tcache_max {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_MAX
+    }
+    tcache_count {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_COUNT
+    }
+    tcache_unsorted_limit {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
+    }
   }
 }
diff --git a/malloc/Makefile b/malloc/Makefile
index e93b83b..dd8a43a 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -168,6 +168,9 @@  tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
 tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
 
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DUSE_TCACHE
+endif
 # Uncomment this for test releases.  For public releases it is too expensive.
 #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
 
diff --git a/malloc/arena.c b/malloc/arena.c
index d49e4a2..79e918f 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -236,6 +236,11 @@  DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
+#if USE_TCACHE
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
+#endif
 #else
 /* Initialization routine. */
 #include <string.h>
@@ -322,6 +327,11 @@  ptmalloc_init (void)
   TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
+#if USE_TCACHE
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
+#endif
   __libc_lock_unlock (main_arena.mutex);
 #else
   const char *s = NULL;
@@ -371,7 +381,23 @@  ptmalloc_init (void)
                   if (memcmp (envline, "ARENA_TEST", 10) == 0)
                     __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
                 }
+#if USE_TCACHE
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
+                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
+                }
+#endif
               break;
+#if USE_TCACHE
+            case 12:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
+                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
+                }
+	      break;
+#endif
             case 15:
               if (!__builtin_expect (__libc_enable_secure, 0))
                 {
@@ -381,6 +407,15 @@  ptmalloc_init (void)
                     __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
                 }
               break;
+#if USE_TCACHE
+            case 21:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
+                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
+                }
+	      break;
+#endif
             default:
               break;
             }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index e29105c..8cd03d8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -297,6 +297,33 @@  __malloc_assert (const char *assertion, const char *file, unsigned int line,
 }
 #endif
 
+#ifndef USE_TCACHE
+# define USE_TCACHE 0
+#endif
+#if USE_TCACHE
+/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
+# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)
+# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
+# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+
+# define tidx2csize(idx)	((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
+# define tidx2usize(idx)	((idx) * MALLOC_ALIGNMENT)
+
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) size2tidx_ (x)
+/* When "x" is from chunksize().  */
+# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
+
+/* Rounds up, so...
+   idx 0   bytes 0
+   idx 1   bytes 1..8
+   idx 2   bytes 9..16
+   etc.  */
+
+/* This is another arbitrary limit, which tunables can change.  */
+# define TCACHE_FILL_COUNT 7
+#endif
+
 
 /*
   REALLOC_ZERO_BYTES_FREES should be set if a call to
@@ -1711,6 +1738,17 @@  struct malloc_par
 
   /* First address handed out by MORECORE/sbrk.  */
   char *sbrk_base;
+
+#if USE_TCACHE
+  /* Maximum number of buckets to use.  */
+  size_t tcache_max;
+  size_t tcache_max_bytes;
+  /* Maximum number of chunks in each bucket.  */
+  size_t tcache_count;
+  /* Maximum number of chunks to remove from the unsorted list, which
+     don't match.  */
+  size_t tcache_unsorted_limit;
+#endif
 };
 
 /* There are several instances of this struct ("arenas") in this
@@ -1749,8 +1787,22 @@  static struct malloc_par mp_ =
   .trim_threshold = DEFAULT_TRIM_THRESHOLD,
 #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
   .arena_test = NARENAS_FROM_NCORES (1)
+#if USE_TCACHE
+  ,
+  .tcache_count = TCACHE_FILL_COUNT,
+  .tcache_max = TCACHE_IDX,
+  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
+  .tcache_unsorted_limit = 0 /* No limit */
+#endif
 };
 
+/*  Non public mallopt parameters.  */
+#if USE_TCACHE
+# define M_TCACHE_COUNT  -9
+# define M_TCACHE_MAX  -10
+# define M_TCACHE_UNSORTED_LIMIT  -11
+#endif
+
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;
 
@@ -2874,6 +2926,106 @@  mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+/* There is one of these for each thread, which contains the
+   per-thread cache (hence "TCache").  Keeping overall size low is
+   mildly important.  Note that COUNTS and ENTRIES are redundant, this
+   is for performance reasons.  */
+typedef struct TCache {
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static __thread char tcache_shutting_down = 0;
+static __thread TCache *tcache = NULL;
+
+static void
+tcache_put (mchunkptr chunk, size_t tc_idx)
+{
+  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
+  e->next = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e;
+  ++(tcache->counts[tc_idx]);
+}
+
+static void *
+tcache_get (size_t tc_idx)
+{
+  TCacheEntry *e = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e->next;
+  --(tcache->counts[tc_idx]);
+  return (void *) e;
+}
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  int i;
+  TCache *tcache_tmp = tcache;
+
+  if (!tcache)
+    return;
+
+  tcache = NULL;
+
+  for (i = 0; i < TCACHE_IDX; ++i) {
+    while (tcache_tmp->entries[i])
+      {
+	TCacheEntry *e = tcache_tmp->entries[i];
+	tcache_tmp->entries[i] = e->next;
+	__libc_free (e);
+      }
+  }
+
+  __libc_free (tcache_tmp);
+
+  tcache_shutting_down = 1;
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+static void
+tcache_init(void)
+{
+  mstate ar_ptr;
+  void *victim = 0;
+  const size_t bytes = sizeof (TCache);
+
+  if (tcache_shutting_down)
+    return;
+
+  arena_get (ar_ptr, bytes);
+  victim = _int_malloc (ar_ptr, bytes);
+  if (!victim && ar_ptr != NULL)
+    {
+      ar_ptr = arena_get_retry (ar_ptr, bytes);
+      victim = _int_malloc (ar_ptr, bytes);
+    }
+
+
+  if (ar_ptr != NULL)
+    __libc_lock_unlock (ar_ptr->mutex);
+
+  if (victim)
+    {
+      tcache = (TCache *) victim;
+      memset (tcache, 0, sizeof (TCache));
+    }
+
+}
+
+#define MAYBE_INIT_TCACHE() \
+  if (__glibc_unlikely (tcache == NULL)) \
+    tcache_init();
+
+#else
+#define MAYBE_INIT_TCACHE()
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +3036,21 @@  __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size (bytes);
+  size_t tc_idx = csize2tidx (tbytes);
+
+  MAYBE_INIT_TCACHE ();
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache
+      && tcache->entries[tc_idx] != NULL)
+    {
+      return tcache_get (tc_idx);
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
 
@@ -2943,6 +3110,8 @@  __libc_free (void *mem)
       return;
     }
 
+  MAYBE_INIT_TCACHE ();
+
   ar_ptr = arena_for_chunk (p);
   _int_free (ar_ptr, p, 0);
 }
@@ -2980,7 +3149,10 @@  __libc_realloc (void *oldmem, size_t bytes)
   if (chunk_is_mmapped (oldp))
     ar_ptr = NULL;
   else
-    ar_ptr = arena_for_chunk (oldp);
+    {
+      MAYBE_INIT_TCACHE ();
+      ar_ptr = arena_for_chunk (oldp);
+    }
 
   /* Little security check which won't hurt performance: the allocator
      never wrapps around at the end of the address space.  Therefore
@@ -3205,6 +3377,8 @@  __libc_calloc (size_t n, size_t elem_size)
 
   sz = bytes;
 
+  MAYBE_INIT_TCACHE ();
+
   arena_get (av, sz);
   if (av)
     {
@@ -3335,6 +3509,10 @@  _int_malloc (mstate av, size_t bytes)
   mchunkptr fwd;                    /* misc temp for linking */
   mchunkptr bck;                    /* misc temp for linking */
 
+#if USE_TCACHE
+  size_t tcache_unsorted_count;	    /* count of unsorted chunks processed */
+#endif
+
   const char *errstr = NULL;
 
   /*
@@ -3387,6 +3565,35 @@  _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tcache && tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+	      int found = 0;
+
+	      /* While bin not empty and tcache not full, copy chunks over.  */
+	      while (tcache->counts[tc_idx] < mp_.tcache_count
+		     && (pp = *fb) != NULL)
+		{
+		  do
+		    {
+	              tc_victim = pp;
+	              if (tc_victim == NULL)
+	                break;
+	            }
+	          while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+	                 != tc_victim);
+		  if (tc_victim != 0)
+		    {
+		      tcache_put (tc_victim, tc_idx);
+		      ++found;
+	            }
+		}
+	    }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3425,6 +3632,34 @@  _int_malloc (mstate av, size_t bytes)
               if (av != &main_arena)
 		set_non_main_arena (victim);
               check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tcache && tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+	      int found = 0;
+
+	      /* While bin not empty and tcache not full, copy chunks over.  */
+	      while (tcache->counts[tc_idx] < mp_.tcache_count
+		     && (tc_victim = last (bin)) != bin)
+		{
+		  if (tc_victim != 0)
+		    {
+		      bck = tc_victim->bk;
+		      set_inuse_bit_at_offset (tc_victim, nb);
+		      if (av != &main_arena)
+			set_non_main_arena (tc_victim);
+		      bin->bk = bck;
+		      bck->fd = bin;
+
+		      tcache_put (tc_victim, tc_idx);
+		      ++found;
+	            }
+		}
+	    }
+#endif
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
@@ -3463,6 +3698,16 @@  _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  size_t tc_idx = csize2tidx (nb);
+  if (tcache && tc_idx < mp_.tcache_max)
+    tcache_nb = nb;
+  int return_cached = 0;
+
+  tcache_unsorted_count = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3768,26 @@  _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
 		set_non_main_arena (victim);
+#if USE_TCACHE
+	      /* Fill cache first, return to user only if cache fills.
+		 We may return one of these chunks later.  */
+	      if (tcache_nb
+		  && tcache->counts[tc_idx] < mp_.tcache_count)
+		{
+		  tcache_put (victim, tc_idx);
+		  return_cached = 1;
+		  continue;
+		}
+	      else
+		{
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+		}
+#endif
             }
 
           /* place chunk in bin */
@@ -3593,11 +3854,31 @@  _int_malloc (mstate av, size_t bytes)
           fwd->bk = victim;
           bck->fd = victim;
 
+#if USE_TCACHE
+      /* If we've processed as many chunks as we're allowed while
+	 filling the cache, return one of the cached ones.  */
+      ++tcache_unsorted_count;
+      if (return_cached
+	  && mp_.tcache_unsorted_limit > 0
+	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
+	{
+	  return tcache_get (tc_idx);
+	}
+#endif
+
 #define MAX_ITERS       10000
           if (++iters >= MAX_ITERS)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+	{
+	  return tcache_get (tc_idx);
+	}
+#endif
+
       /*
          If a large request, scan through the chunks of current bin in
          sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4164,20 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = csize2tidx (size);
+
+    if (tcache
+	&& tc_idx < mp_.tcache_max
+	&& tcache->counts[tc_idx] < mp_.tcache_count)
+      {
+	tcache_put (p, tc_idx);
+	return;
+      }
+  }
+#endif
+
   /*
     If eligible, place chunk on a fastbin so it can be found
     and used quickly in malloc.
@@ -4844,6 +5139,38 @@  do_set_arena_max (size_t value)
   return 1;
 }
 
+#if USE_TCACHE
+static inline int
+__always_inline
+do_set_tcache_max (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+  if (value >= 0 && value <= MAX_TCACHE_SIZE)
+    {
+      mp_.tcache_max_bytes = value;
+      mp_.tcache_max = usize2tidx (value) + 1;
+    }
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_count (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+  mp_.tcache_count = value;
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_unsorted_limit (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  mp_.tcache_unsorted_limit = value;
+  return 1;
+}
+#endif
 
 int
 __libc_mallopt (int param_number, int value)
@@ -4904,6 +5231,20 @@  __libc_mallopt (int param_number, int value)
       if (value > 0)
 	do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+	do_set_tcache_count (value);
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+	do_set_tcache_max (value);
+      break;
+    case M_TCACHE_UNSORTED_LIMIT:
+      if (value >= 0)
+	do_set_tcache_unsorted_limit (value);
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;