malloc per-thread cache ready for review

Message ID xnpoj9mxg9.fsf@greed.delorie.com
State Committed
Headers

Commit Message

DJ Delorie Jan. 26, 2017, 7:29 p.m. UTC
  tl;dr...

I'd like to commit this when 2.26 opens for new patches.  It adds a
per-thread cache to malloc.

git checkout dj/malloc-tcache
https://sourceware.org/git/?p=glibc.git;a=shortlog;h=refs/heads/dj/malloc-tcache

git diff origin/master...origin/dj/malloc-tcache

Tunables:

glibc.malloc.tcache_max=<size_t> - largest cached chunk in bytes
glibc.malloc.tcache_count=<size_t> - most cached chunks per bin
glibc.malloc.tcache_unsorted_limit=<size_t> - how far to search the
	unsorted list for additional cacheable blocks.  (default 0 =
	unlimited)

----------------------------------------
Long version...

Most of you are probably aware that I've been working on malloc
performance improvements for a while now.  The first solid improvement
is ready for review, and I'd like to get it reviewed and ready for
commit when 2.26 opens up for new work.

This change adds a small per-thread cache (called "tcache") to the
standard malloc, which allows many of the malloc/free requests to
avoid doing *any* atomics or locks, shortening the "fast path"
significantly.  The tcache has a maximum block size it caches
(tcache_max, default (and maximum) about 504 bytes on 32-bit systems,
1008 on 64-bit systems) and a maximum number of blocks per bucket
(tcache_count, default 7).  When the tcache has a suitable block,
malloc can return it immediately.  Free can store it immediately if
there's space in the tcache.  If malloc can't use the cache, once it
takes the locks on an arena, it claims additional blocks to pre-fill
the cache while it's looking for a returnable block.

Tested on x86-64 and x86-32, including replacing the system glibc.

Performance-wise, I've seen this code run on average in 83% of the
time of baseline code (i.e. 1.2x faster), see this post for details:
https://sourceware.org/ml/libc-alpha/2017-01/msg00452.html

RSS size is about the same with this change, with a few outliers in
tests-worst-case benchmarks, of course.

The patch re-enables the "experimental malloc" configure option, and
defaults it to "yes".  Disabling it disables the tcache code.

  --disable-experimental-malloc
                          disable experimental malloc features

and adds three tunables:

    tcache_max {
      type: SIZE_T
      env_alias: MALLOC_TCACHE_MAX
    }
    tcache_count {
      type: SIZE_T
      env_alias: MALLOC_TCACHE_COUNT
    }
    tcache_unsorted_limit {
      type: SIZE_T
      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
    }

The tcache is layered on top of the existing fastbins and smallbins.
Since the tcache is small, fastbins still provide a performance
improvement (albeit less of one) and should not be removed at this
time.  Think of them as L3/L2/L1 caches ;-)

Patch attached.

	* config.make.in: Add experimental-malloc option.
	* configure.ac: Likewise.  Default to enabled.
	* configure: Regenerate.
	* elf/dl-tunables.list: Add per-thread cache tunables.
	* malloc/Makefile: Enable per-thread cache if experimental
	malloc is enabled.
	* malloc/arena.c: Add per-thread tunables.
	* malloc/malloc.c: Add per-thread cache.
  

Comments

Szabolcs Nagy Jan. 27, 2017, 2:39 p.m. UTC | #1
On 26/01/17 19:29, DJ Delorie wrote:
> +typedef struct TCache {
> +  /* 0 = uninitted, 1 = normal, anything else = shutting down */
> +  char initted;
> +  char counts[TCACHE_IDX];
> +  TCacheEntry *entries[TCACHE_IDX];
> +} TCache;
> +
> +static __thread TCache tcache = {0,0,0,{0},{0}};

this initializer seems to have two excess 0

(i'd just remove the initializer if it's static __thread)
  

Patch

diff --git a/config.make.in b/config.make.in
index 5836b32..0290d83 100644
--- a/config.make.in
+++ b/config.make.in
@@ -77,6 +77,8 @@  multi-arch = @multi_arch@
 
 mach-interface-list = @mach_interface_list@
 
+experimental-malloc = @experimental_malloc@
+
 nss-crypt = @libc_cv_nss_crypt@
 static-nss-crypt = @libc_cv_static_nss_crypt@
 
diff --git a/configure.ac b/configure.ac
index 4a77411..b929012 100644
--- a/configure.ac
+++ b/configure.ac
@@ -313,6 +313,13 @@  AC_ARG_ENABLE([multi-arch],
 	      [multi_arch=$enableval],
 	      [multi_arch=default])
 
+AC_ARG_ENABLE([experimental-malloc],
+	      AC_HELP_STRING([--disable-experimental-malloc],
+			     [disable experimental malloc features]),
+	      [experimental_malloc=$enableval],
+	      [experimental_malloc=yes])
+AC_SUBST(experimental_malloc)
+
 AC_ARG_ENABLE([nss-crypt],
 	      AC_HELP_STRING([--enable-nss-crypt],
 			     [enable libcrypt to use nss]),
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index d8cd912..3e49875 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -64,5 +64,17 @@  glibc {
       env_alias: MALLOC_ARENA_TEST
       minval: 1
     }
+    tcache_max {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_MAX
+    }
+    tcache_count {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_COUNT
+    }
+    tcache_unsorted_limit {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
+    }
   }
 }
diff --git a/malloc/Makefile b/malloc/Makefile
index e93b83b..dd8a43a 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -168,6 +168,9 @@  tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
 tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
 
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DUSE_TCACHE
+endif
 # Uncomment this for test releases.  For public releases it is too expensive.
 #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
 
diff --git a/malloc/arena.c b/malloc/arena.c
index b91d7d6..74616df 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -236,6 +236,9 @@  DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
 #else
 /* Initialization routine. */
 #include <string.h>
@@ -322,6 +325,11 @@  ptmalloc_init (void)
   TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
+#if USE_TCACHE
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
+#endif
   __libc_lock_unlock (main_arena.mutex);
 #else
   const char *s = NULL;
@@ -371,7 +379,23 @@  ptmalloc_init (void)
                   if (memcmp (envline, "ARENA_TEST", 10) == 0)
                     __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
                 }
+#if USE_TCACHE
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
+                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
+                }
+#endif
               break;
+#if USE_TCACHE
+            case 12:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
+                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
+                }
+	      break;
+#endif
             case 15:
               if (!__builtin_expect (__libc_enable_secure, 0))
                 {
@@ -381,6 +405,15 @@  ptmalloc_init (void)
                     __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
                 }
               break;
+#if USE_TCACHE
+            case 21:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
+                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
+                }
+	      break;
+#endif
             default:
               break;
             }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 4885793..b618be8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -297,6 +297,33 @@  __malloc_assert (const char *assertion, const char *file, unsigned int line,
 }
 #endif
 
+#ifndef USE_TCACHE
+# define USE_TCACHE 0
+#endif
+#if USE_TCACHE
+/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
+# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)
+# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
+# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+
+# define tidx2csize(idx)	((idx)*MALLOC_ALIGNMENT + SIZE_SZ)
+# define tidx2usize(idx)	((idx)*MALLOC_ALIGNMENT)
+
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) size2tidx_(x)
+/* When "x" is from chunksize().  */
+# define csize2tidx(x) size2tidx_((x)-SIZE_SZ)
+
+/* Rounds up, so...
+   idx 0   bytes 0
+   idx 1   bytes 1..8
+   idx 2   bytes 9..16
+   etc.  */
+
+/* This is another arbitrary limit, which tunables can change.  */
+# define TCACHE_FILL_COUNT 7
+#endif
+
 
 /*
   REALLOC_ZERO_BYTES_FREES should be set if a call to
@@ -1711,6 +1738,17 @@  struct malloc_par
 
   /* First address handed out by MORECORE/sbrk.  */
   char *sbrk_base;
+
+#if USE_TCACHE
+  /* Maximum number of buckets to use.  */
+  size_t tcache_max;
+  size_t tcache_max_bytes;
+  /* Maximum number of chunks in each bucket.  */
+  size_t tcache_count;
+  /* Maximum number of chunks to remove from the unsorted list, which
+     don't match.  */
+  size_t tcache_unsorted_limit;
+#endif
 };
 
 /* There are several instances of this struct ("arenas") in this
@@ -1749,8 +1787,22 @@  static struct malloc_par mp_ =
   .trim_threshold = DEFAULT_TRIM_THRESHOLD,
 #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
   .arena_test = NARENAS_FROM_NCORES (1)
+#if USE_TCACHE
+  ,
+  .tcache_count = TCACHE_FILL_COUNT,
+  .tcache_max = TCACHE_IDX,
+  .tcache_max_bytes = tidx2usize (TCACHE_IDX),
+  .tcache_unsorted_limit = 0 /* No limit */
+#endif
 };
 
+/*  Non public mallopt parameters.  */
+#if USE_TCACHE
+# define M_TCACHE_COUNT  -9
+# define M_TCACHE_MAX  -10
+# define M_TCACHE_UNSORTED_LIMIT  -11
+#endif
+
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;
 
@@ -2874,6 +2926,38 @@  mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+/* There is one of these for each thread, which contains the
+   per-thread cache (hence "TCache").  Keeping overall size low is
+   mildly important.  INITTED makes sure we don't use the cache when
+   we're shutting down the thread.  Note that COUNTS and ENTRIES are
+   redundant, this is for performance reasons.  */
+typedef struct TCache {
+  /* 0 = uninitted, 1 = normal, anything else = shutting down */
+  char initted;
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static __thread TCache tcache = {0,0,0,{0},{0}};
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  /* We should flush the cache out to the main arena also, but most of
+     the time we're just exiting the process completely.  */
+  if (tcache.initted == 1)
+    tcache.initted = 2;
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +2968,22 @@  __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size (bytes);
+  size_t tc_idx = csize2tidx (tbytes);
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache.entries[tc_idx] != NULL
+      && tcache.initted == 1)
+    {
+      TCacheEntry *e = tcache.entries[tc_idx];
+      tcache.entries[tc_idx] = e->next;
+      --tcache.counts[tc_idx];
+      return (void *) e;
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
 
@@ -3335,6 +3435,10 @@  _int_malloc (mstate av, size_t bytes)
   mchunkptr fwd;                    /* misc temp for linking */
   mchunkptr bck;                    /* misc temp for linking */
 
+#if USE_TCACHE
+  size_t tcache_unsorted_count;	    /* count of unsorted chunks processed */
+#endif
+
   const char *errstr = NULL;
 
   /*
@@ -3387,6 +3491,38 @@  _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+	      int found = 0;
+
+	      /* While bin not empty and tcache not full, copy chunks over.  */
+	      while (tcache.counts[tc_idx] < mp_.tcache_count
+		     && (pp = *fb) != NULL)
+		{
+		  do
+		    {
+	              tc_victim = pp;
+	              if (tc_victim == NULL)
+	                break;
+	            }
+	          while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+	                 != tc_victim);
+		  if (tc_victim != 0)
+		    {
+		      TCacheEntry *e = (TCacheEntry *) chunk2mem (tc_victim);
+		      e->next = tcache.entries[tc_idx];
+		      tcache.entries[tc_idx] = e;
+		      ++tcache.counts[tc_idx];
+		      ++found;
+	            }
+		}
+	    }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3425,6 +3561,37 @@  _int_malloc (mstate av, size_t bytes)
               if (av != &main_arena)
 		set_non_main_arena (victim);
               check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+	      int found = 0;
+
+	      /* While bin not empty and tcache not full, copy chunks over.  */
+	      while (tcache.counts[tc_idx] < mp_.tcache_count
+		     && (tc_victim = last (bin)) != bin)
+		{
+		  if (tc_victim != 0)
+		    {
+		      bck = tc_victim->bk;
+		      set_inuse_bit_at_offset (tc_victim, nb);
+		      if (av != &main_arena)
+			set_non_main_arena (tc_victim);
+		      bin->bk = bck;
+		      bck->fd = bin;
+
+		      TCacheEntry *e = (TCacheEntry *) chunk2mem (tc_victim);
+		      e->next = tcache.entries[tc_idx];
+		      tcache.entries[tc_idx] = e;
+		      ++tcache.counts[tc_idx];
+		      ++found;
+	            }
+		}
+	    }
+#endif
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
@@ -3463,6 +3630,16 @@  _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  if (csize2tidx (nb) <= mp_.tcache_max)
+    tcache_nb = nb;
+  size_t tc_idx = csize2tidx (nb);
+  int return_cached = 0;
+
+  tcache_unsorted_count = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3700,29 @@  _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
 		set_non_main_arena (victim);
+#if USE_TCACHE
+	      /* Fill cache first, return to user only if cache fills.
+		 We may return one of these chunks later.  */
+	      if (tcache_nb
+		  && tcache.counts[tc_idx] < mp_.tcache_count)
+		{
+		  TCacheEntry *e = (TCacheEntry *) chunk2mem (victim);
+		  e->next = tcache.entries[tc_idx];
+		  tcache.entries[tc_idx] = e;
+		  ++tcache.counts[tc_idx];
+		  return_cached = 1;
+		  continue;
+		}
+	      else
+		{
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+		}
+#endif
             }
 
           /* place chunk in bin */
@@ -3593,11 +3789,37 @@  _int_malloc (mstate av, size_t bytes)
           fwd->bk = victim;
           bck->fd = victim;
 
+#if USE_TCACHE
+      /* If we've processed as many chunks as we're allowed while
+	 filling the cache, return one of the cached ones.  */
+      ++tcache_unsorted_count;
+      if (return_cached
+	  && mp_.tcache_unsorted_limit > 0
+	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
+	{
+	  TCacheEntry *e = tcache.entries[tc_idx];
+	  tcache.entries[tc_idx] = e->next;
+	  --tcache.counts[tc_idx];
+	  return (void *) e;
+	}
+#endif
+
 #define MAX_ITERS       10000
           if (++iters >= MAX_ITERS)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+	{
+	  TCacheEntry *e = tcache.entries[tc_idx];
+	  tcache.entries[tc_idx] = e->next;
+	  --tcache.counts[tc_idx];
+	  return (void *) e;
+	}
+#endif
+
       /*
          If a large request, scan through the chunks of current bin in
          sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4105,23 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = csize2tidx (size);
+
+    if (tc_idx < mp_.tcache_max
+	&& tcache.counts[tc_idx] < mp_.tcache_count
+	&& tcache.initted == 1)
+      {
+	TCacheEntry *e = (TCacheEntry *) chunk2mem (p);
+	e->next = tcache.entries[tc_idx];
+	tcache.entries[tc_idx] = e;
+	++tcache.counts[tc_idx];
+	return;
+      }
+  }
+#endif
+
   /*
     If eligible, place chunk on a fastbin so it can be found
     and used quickly in malloc.
@@ -4844,6 +5083,38 @@  do_set_arena_max (size_t value)
   return 1;
 }
 
+#ifdef USE_TCACHE
+static inline int
+__always_inline
+do_set_tcache_max (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+  if (value >= 0 && value <= MAX_TCACHE_SIZE)
+    {
+      mp_.tcache_max_bytes = value;
+      mp_.tcache_max = usize2tidx (value) + 1;
+    }
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_count (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+  mp_.tcache_count = value;
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_unsorted_limit (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  mp_.tcache_unsorted_limit = value;
+  return 1;
+}
+#endif
 
 int
 __libc_mallopt (int param_number, int value)
@@ -4904,6 +5175,20 @@  __libc_mallopt (int param_number, int value)
       if (value > 0)
 	do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+	do_set_tcache_count (value);
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+	do_set_tcache_max (value);
+      break;
+    case M_TCACHE_UNSORTED_LIMIT:
+      if (value >= 0)
+	do_set_tcache_unsorted_limit (value);
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;