From patchwork Fri Nov 4 00:38:15 2016
X-Patchwork-Submitter: DJ Delorie
X-Patchwork-Id: 17174
Date: Thu, 03 Nov 2016 20:38:15 -0400
From: DJ Delorie
To: libc-alpha@sourceware.org
Subject: dj/malloc-tcache: per-thread cache - new feature branch, for review

I'd like to announce the creation of the dj/malloc-tcache branch, which
contains a broken-out copy of the per-thread cache feature I developed
in dj/malloc (a branch that carries other malloc-related changes as
well).  I invite others to review this patch for possible inclusion in
a future glibc release, or at least to play with it and have some fun
at my expense ;-)

General benchmarks (the dj/malloc branch includes a trace-capture and
workload-simulator facility) show a 10-20% overall improvement, mostly
in malloc/free; calloc and realloc slow down slightly.

Patch attached for general amusement...

commit ba163b353ee76e9dad24246cb07e47a6840cfbae
Author: DJ Delorie
Date:   Thu Nov 3 19:58:43 2016 -0400

    malloc per-thread cache, separated out
    
    This is the per-thread cache feature of the dj/malloc branch, broken
    out as a separate feature for review.  Enabled by default; use
    --disable-experimental-malloc to disable it.  mallopt() support for
    M_TCACHE_COUNT and M_TCACHE_MAX is added for tuning.
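(For anyone who wants to experiment with the tuning knobs: the sketch
below is a minimal illustration and is not part of the patch.
M_TCACHE_COUNT and M_TCACHE_MAX are non-public mallopt() parameters in
this patch -- the -9/-10 values mirror the #defines inside malloc.c and
are not exported by any installed header, so a test program has to
define them itself.)

/* Illustration only: tune the per-thread cache via the non-public
   mallopt() parameters added by this patch.  */
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#define M_TCACHE_COUNT -9    /* max chunks kept per size bucket */
#define M_TCACHE_MAX   -10   /* largest size (bytes) eligible for caching */

int
main (void)
{
  /* Keep up to 16 chunks per bucket and only cache requests up to 256
     bytes.  In a tree built with --disable-experimental-malloc these
     parameter numbers are simply not recognized and have no effect.  */
  mallopt (M_TCACHE_COUNT, 16);
  mallopt (M_TCACHE_MAX, 256);

  void *p = malloc (100);
  free (p);                    /* lands in this thread's cache */
  void *q = malloc (100);      /* served from the cache, no arena lock */
  printf ("p=%p q=%p\n", p, q);
  free (q);
  return 0;
}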
if test "${enable_nss_crypt+set}" = set; then : enableval=$enable_nss_crypt; nss_crypt=$enableval diff --git a/configure.ac b/configure.ac index a64aeb9..9b1c552 100644 --- a/configure.ac +++ b/configure.ac @@ -301,6 +301,13 @@ AC_ARG_ENABLE([multi-arch], [multi_arch=$enableval], [multi_arch=default]) +AC_ARG_ENABLE([experimental-malloc], + AC_HELP_STRING([--disable-experimental-malloc], + [disable experimental malloc features]), + [experimental_malloc=$enableval], + [experimental_malloc=yes]) +AC_SUBST(experimental_malloc) + AC_ARG_ENABLE([nss-crypt], AC_HELP_STRING([--enable-nss-crypt], [enable libcrypt to use nss]), diff --git a/malloc/Makefile b/malloc/Makefile index b8efcd6..94a9661 100644 --- a/malloc/Makefile +++ b/malloc/Makefile @@ -159,6 +159,9 @@ endif tst-mcheck-ENV = MALLOC_CHECK_=3 tst-malloc-usable-ENV = MALLOC_CHECK_=3 +ifeq ($(experimental-malloc),yes) +CPPFLAGS-malloc.c += -DUSE_TCACHE +endif # Uncomment this for test releases. For public releases it is too expensive. #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 diff --git a/malloc/malloc.c b/malloc/malloc.c index 584edbf..e57d0cd 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -297,6 +297,25 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, } #endif +#ifndef USE_TCACHE +#define USE_TCACHE 0 +#endif +#if USE_TCACHE +/* we want 64 entries */ +#define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63) +#define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1) +#define size2tidx(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) + +/* Rounds up, so... + idx 0 bytes 0 + idx 1 bytes 1..8 + idx 2 bytes 9..16 + etc +*/ + +#define TCACHE_FILL_COUNT 7 +#endif + /* REALLOC_ZERO_BYTES_FREES should be set if a call to @@ -1711,6 +1730,13 @@ struct malloc_par /* First address handed out by MORECORE/sbrk. */ char *sbrk_base; + +#if USE_TCACHE + /* Maximum number of buckets to use. */ + size_t tcache_max; + /* Maximum number of chunks in each bucket. */ + size_t tcache_count; +#endif }; /* There are several instances of this struct ("arenas") in this @@ -1749,8 +1775,19 @@ static struct malloc_par mp_ = .trim_threshold = DEFAULT_TRIM_THRESHOLD, #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8)) .arena_test = NARENAS_FROM_NCORES (1) +#if USE_TCACHE + , + .tcache_count = TCACHE_FILL_COUNT, + .tcache_max = TCACHE_IDX-1 +#endif }; +/* Non public mallopt parameters. */ +#if USE_TCACHE +#define M_TCACHE_COUNT -9 +#define M_TCACHE_MAX -10 +#endif + /* Maximum size of memory handled in fastbins. */ static INTERNAL_SIZE_T global_max_fast; @@ -2874,6 +2911,44 @@ mremap_chunk (mchunkptr p, size_t new_size) /*------------------------ Public wrappers. 
@@ -2874,6 +2911,44 @@ mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+typedef struct TCache {
+  struct TCache *prev, *next;
+  char initted; /* 0 = uninitted, 1 = normal, anything else = shutting down */
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static TCache *tcache_list = NULL;
+__libc_lock_define_initialized (static, tcache_mutex);
+
+static __thread TCache tcache = {0,0,0,{0},{0}};
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  if (tcache.initted == 1)
+    {
+      __libc_lock_lock (tcache_mutex);
+      tcache.initted = 2;
+      if (tcache.next)
+        tcache.next->prev = tcache.prev;
+      if (tcache.prev)
+        tcache.prev->next = tcache.next;
+      else
+        tcache_list = tcache.next;
+      __libc_lock_unlock (tcache_mutex);
+    }
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +2959,33 @@ __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size(bytes);
+  size_t tc_idx = size2tidx (tbytes);
+
+  if (tcache.initted == 0)
+    {
+      tcache.initted = 1;
+      __libc_lock_lock (tcache_mutex);
+      tcache.next = tcache_list;
+      if (tcache.next)
+        tcache.next->prev = &tcache;
+      tcache_list = &tcache;
+      __libc_lock_unlock (tcache_mutex);
+    }
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache.entries[tc_idx] != NULL
+      && tcache.initted == 1)
+    {
+      TCacheEntry *e = tcache.entries[tc_idx];
+      tcache.entries[tc_idx] = e->next;
+      tcache.counts[tc_idx] --;
+      return (void *) e;
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
 
@@ -3387,6 +3489,38 @@ _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = size2tidx (nb-SIZE_SZ);
+          if (tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache.counts[tc_idx] < mp_.tcache_count
+                     && (pp = *fb) != NULL)
+                {
+                  do
+                    {
+                      tc_victim = pp;
+                      if (tc_victim == NULL)
+                        break;
+                    }
+                  while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+                         != tc_victim);
+                  if (tc_victim != 0)
+                    {
+                      TCacheEntry *e = (TCacheEntry *) chunk2mem(tc_victim);
+                      e->next = tcache.entries[tc_idx];
+                      tcache.entries[tc_idx] = e;
+                      tcache.counts[tc_idx] ++;
+                      found ++;
+                    }
+                }
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
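(Another aside, not part of the patch: the cache itself is just one
singly-linked list of returned chunks per size class, kept in
thread-local storage so neither the malloc fast path nor the free fast
path takes a lock.  The toy model below -- plain heap nodes instead of
malloc chunks, and a hard-coded bucket index -- shows the push/pop
discipline that _int_free and __libc_malloc follow in the hunks above.)

/* Toy model of the per-thread cache lists (illustration only).  */
#include <stdio.h>
#include <stdlib.h>

#define TCACHE_IDX        64
#define TCACHE_FILL_COUNT 7

typedef struct TCacheEntry
{
  struct TCacheEntry *next;
} TCacheEntry;

static __thread TCacheEntry *entries[TCACHE_IDX];
static __thread char counts[TCACHE_IDX];

/* The "free" half: push a block onto its bucket if there is room;
   otherwise the caller falls back to the normal arena path.  */
static int
cache_put (void *block, size_t tc_idx)
{
  if (tc_idx >= TCACHE_IDX || counts[tc_idx] >= TCACHE_FILL_COUNT)
    return 0;
  TCacheEntry *e = block;
  e->next = entries[tc_idx];
  entries[tc_idx] = e;
  ++counts[tc_idx];
  return 1;
}

/* The "malloc" half: pop a cached block if one is available.  */
static void *
cache_get (size_t tc_idx)
{
  if (tc_idx >= TCACHE_IDX || entries[tc_idx] == NULL)
    return NULL;
  TCacheEntry *e = entries[tc_idx];
  entries[tc_idx] = e->next;
  --counts[tc_idx];
  return e;
}

int
main (void)
{
  void *b = malloc (64);
  cache_put (b, 4);               /* pretend 64 bytes maps to bucket 4 */
  void *again = cache_get (4);    /* handed straight back, lock-free */
  printf ("%s\n", again == b ? "reused from cache" : "cache miss");
  free (again);
  return 0;
}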
@@ -3425,6 +3559,40 @@ _int_malloc (mstate av, size_t bytes)
           if (av != &main_arena)
             set_non_main_arena (victim);
           check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = size2tidx (nb-SIZE_SZ);
+          if (tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache.counts[tc_idx] < mp_.tcache_count
+                     && (tc_victim = last(bin)) != bin)
+                {
+                  if (tc_victim != 0)
+                    {
+                      bck = tc_victim->bk;
+                      set_inuse_bit_at_offset (tc_victim, nb);
+                      if (av != &main_arena)
+                        set_non_main_arena (tc_victim);
+                      bin->bk = bck;
+                      bck->fd = bin;
+
+                      TCacheEntry *e = (TCacheEntry *) chunk2mem(tc_victim);
+                      e->next = tcache.entries[tc_idx];
+                      tcache.entries[tc_idx] = e;
+                      tcache.counts[tc_idx] ++;
+                      found ++;
+                      //_m_printf("snarf chunk %p %lx %p %lx\n", tc_victim, nb,
+                      //          chunk_at_offset(tc_victim, nb), chunk_at_offset(tc_victim, nb)->size);
+                    }
+                }
+              //_m_printf("%d chunks found in smallbin\n", found);
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3463,6 +3631,14 @@ _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  if (size2tidx (nb-SIZE_SZ) <= mp_.tcache_max)
+    tcache_nb = nb;
+  size_t tc_idx = size2tidx (nb-SIZE_SZ);
+  int return_cached = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3699,29 @@ _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
                 set_non_main_arena (victim);
+#if USE_TCACHE
+              /* Fill cache first, return to user only if cache fills.
+                 We may return one of these chunks later.  */
+              if (tcache_nb
+                  && tcache.counts[tc_idx] < mp_.tcache_count)
+                {
+                  TCacheEntry *e = (TCacheEntry *) chunk2mem(victim);
+                  e->next = tcache.entries[tc_idx];
+                  tcache.entries[tc_idx] = e;
+                  tcache.counts[tc_idx] ++;
+                  return_cached = 1;
+                  continue;
+                }
+              else
+                {
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+                }
+#endif
             }
 
           /* place chunk in bin */
@@ -3598,6 +3793,17 @@ _int_malloc (mstate av, size_t bytes)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+        {
+          TCacheEntry *e = tcache.entries[tc_idx];
+          tcache.entries[tc_idx] = e->next;
+          tcache.counts[tc_idx] --;
+          return (void *) e;
+        }
+#endif
+
       /* If a large request, scan through the chunks of current bin in
          sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4089,23 @@ _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = size2tidx (size - SIZE_SZ);
+
+    if (tc_idx < mp_.tcache_max
+        && tcache.counts[tc_idx] < mp_.tcache_count
+        && tcache.initted == 1)
+      {
+        TCacheEntry *e = (TCacheEntry *) chunk2mem (p);
+        e->next = tcache.entries[tc_idx];
+        tcache.entries[tc_idx] = e;
+        tcache.counts[tc_idx] ++;
+        return;
+      }
+  }
+#endif
+
   /* If eligible, place chunk on a fastbin so it can be found
      and used quickly in malloc.
@@ -4904,6 +5127,26 @@ __libc_mallopt (int param_number, int value)
       if (value > 0)
         do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+        {
+          LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+          mp_.tcache_count = value;
+        }
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+        {
+          value = size2tidx (value);
+          if (value < TCACHE_IDX)
+            {
+              LIBC_PROBE (memory_mallopt_tcache_max, 2, value, mp_.tcache_max);
+              mp_.tcache_max = value;
+            }
+        }
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;
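(Last aside, also not part of the patch: if you just want to feel the
difference, the ad-hoc loop below hammers the small, same-size
malloc/free pattern the cache is aimed at.  It is not the trace-capture
or simulator tooling from the dj/malloc branch, only a quick smoke test
to run against builds configured with and without
--disable-experimental-malloc.)

/* Ad-hoc stress loop for the malloc/free fast path (illustration only).
   Build: gcc -O2 -pthread tcache-stress.c -o tcache-stress  */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NTHREADS     4
#define ITERATIONS   10000000L
#define REQUEST_SIZE 64          /* small enough to stay in the cache */

static void *
worker (void *arg)
{
  for (long i = 0; i < ITERATIONS; i++)
    {
      void *p = malloc (REQUEST_SIZE);
      if (p == NULL)
        abort ();
      ((volatile char *) p)[0] = 1;   /* touch the block */
      free (p);                       /* goes back into this thread's cache */
    }
  return arg;
}

int
main (void)
{
  pthread_t threads[NTHREADS];

  for (int i = 0; i < NTHREADS; i++)
    pthread_create (&threads[i], NULL, worker, NULL);
  for (int i = 0; i < NTHREADS; i++)
    pthread_join (threads[i], NULL);

  puts ("done");
  return 0;
}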