From patchwork Fri Nov 4 00:38:15 2016
X-Patchwork-Submitter: DJ Delorie
X-Patchwork-Id: 17174
Date: Thu, 03 Nov 2016 20:38:15 -0400
From: DJ Delorie
To: libc-alpha@sourceware.org
Subject: dj/malloc-tcache: per-thread cache - new feature branch, for review

I'd like to announce the creation of the dj/malloc-tcache branch, which
contains a broken-out copy of the per-thread cache feature I developed
in dj/malloc (a branch that carries other malloc-related changes as
well).  I invite others to review this patch for possible inclusion in
a future glibc release, or at least to play with it and have some fun
at my expense ;-)

General benchmarks (the dj/malloc branch includes a trace-capture and
workload-simulator facility) show a 10-20% overall improvement, mostly
in malloc/free; calloc and realloc slow down slightly.

Patch attached for general amusement...

commit ba163b353ee76e9dad24246cb07e47a6840cfbae
Author: DJ Delorie
Date:   Thu Nov 3 19:58:43 2016 -0400

    malloc per-thread cache, separated out
    
    This is the per-thread cache feature of the dj/malloc branch, broken
    out as a separate feature for review.  Enabled by default; use
    --disable-experimental-malloc to disable it.  mallopt() support for
    M_TCACHE_COUNT and M_TCACHE_MAX is added for tuning.
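(For anyone who wants to experiment with the tuning knobs: the sketch
below is a minimal illustration and is not part of the patch.
M_TCACHE_COUNT and M_TCACHE_MAX are non-public mallopt() parameters in
this patch -- the -9/-10 values mirror the #defines inside malloc.c and
are not exported by any installed header, so a test program has to
define them itself.)

/* Illustration only: tune the per-thread cache via the non-public
   mallopt() parameters added by this patch.  */
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#define M_TCACHE_COUNT -9    /* max chunks kept per size bucket */
#define M_TCACHE_MAX   -10   /* largest size (bytes) eligible for caching */

int
main (void)
{
  /* Keep up to 16 chunks per bucket and only cache requests up to 256
     bytes.  In a tree built with --disable-experimental-malloc these
     parameter numbers are simply not recognized and have no effect.  */
  mallopt (M_TCACHE_COUNT, 16);
  mallopt (M_TCACHE_MAX, 256);

  void *p = malloc (100);
  free (p);                    /* lands in this thread's cache */
  void *q = malloc (100);      /* served from the cache, no arena lock */
  printf ("p=%p q=%p\n", p, q);
  free (q);
  return 0;
}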
if test "${enable_nss_crypt+set}" = set; then : enableval=$enable_nss_crypt; nss_crypt=$enableval diff --git a/configure.ac b/configure.ac index a64aeb9..9b1c552 100644 --- a/configure.ac +++ b/configure.ac @@ -301,6 +301,13 @@ AC_ARG_ENABLE([multi-arch], [multi_arch=$enableval], [multi_arch=default]) +AC_ARG_ENABLE([experimental-malloc], + AC_HELP_STRING([--disable-experimental-malloc], + [disable experimental malloc features]), + [experimental_malloc=$enableval], + [experimental_malloc=yes]) +AC_SUBST(experimental_malloc) + AC_ARG_ENABLE([nss-crypt], AC_HELP_STRING([--enable-nss-crypt], [enable libcrypt to use nss]), diff --git a/malloc/Makefile b/malloc/Makefile index b8efcd6..94a9661 100644 --- a/malloc/Makefile +++ b/malloc/Makefile @@ -159,6 +159,9 @@ endif tst-mcheck-ENV = MALLOC_CHECK_=3 tst-malloc-usable-ENV = MALLOC_CHECK_=3 +ifeq ($(experimental-malloc),yes) +CPPFLAGS-malloc.c += -DUSE_TCACHE +endif # Uncomment this for test releases. For public releases it is too expensive. #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 diff --git a/malloc/malloc.c b/malloc/malloc.c index 584edbf..e57d0cd 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -297,6 +297,25 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, } #endif +#ifndef USE_TCACHE +#define USE_TCACHE 0 +#endif +#if USE_TCACHE +/* we want 64 entries */ +#define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63) +#define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1) +#define size2tidx(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) + +/* Rounds up, so... + idx 0 bytes 0 + idx 1 bytes 1..8 + idx 2 bytes 9..16 + etc +*/ + +#define TCACHE_FILL_COUNT 7 +#endif + /* REALLOC_ZERO_BYTES_FREES should be set if a call to @@ -1711,6 +1730,13 @@ struct malloc_par /* First address handed out by MORECORE/sbrk. */ char *sbrk_base; + +#if USE_TCACHE + /* Maximum number of buckets to use. */ + size_t tcache_max; + /* Maximum number of chunks in each bucket. */ + size_t tcache_count; +#endif }; /* There are several instances of this struct ("arenas") in this @@ -1749,8 +1775,19 @@ static struct malloc_par mp_ = .trim_threshold = DEFAULT_TRIM_THRESHOLD, #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8)) .arena_test = NARENAS_FROM_NCORES (1) +#if USE_TCACHE + , + .tcache_count = TCACHE_FILL_COUNT, + .tcache_max = TCACHE_IDX-1 +#endif }; +/* Non public mallopt parameters. */ +#if USE_TCACHE +#define M_TCACHE_COUNT -9 +#define M_TCACHE_MAX -10 +#endif + /* Maximum size of memory handled in fastbins. */ static INTERNAL_SIZE_T global_max_fast; @@ -2874,6 +2911,44 @@ mremap_chunk (mchunkptr p, size_t new_size) /*------------------------ Public wrappers. 
@@ -2874,6 +2911,44 @@ mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+typedef struct TCache {
+  struct TCache *prev, *next;
+  char initted; /* 0 = uninitted, 1 = normal, anything else = shutting down */
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static TCache *tcache_list = NULL;
+__libc_lock_define_initialized (static, tcache_mutex);
+
+static __thread TCache tcache = {0,0,0,{0},{0}};
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  if (tcache.initted == 1)
+    {
+      __libc_lock_lock (tcache_mutex);
+      tcache.initted = 2;
+      if (tcache.next)
+        tcache.next->prev = tcache.prev;
+      if (tcache.prev)
+        tcache.prev->next = tcache.next;
+      else
+        tcache_list = tcache.next;
+      __libc_lock_unlock (tcache_mutex);
+    }
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +2959,33 @@ __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size(bytes);
+  size_t tc_idx = size2tidx (tbytes);
+
+  if (tcache.initted == 0)
+    {
+      tcache.initted = 1;
+      __libc_lock_lock (tcache_mutex);
+      tcache.next = tcache_list;
+      if (tcache.next)
+        tcache.next->prev = &tcache;
+      tcache_list = &tcache;
+      __libc_lock_unlock (tcache_mutex);
+    }
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache.entries[tc_idx] != NULL
+      && tcache.initted == 1)
+    {
+      TCacheEntry *e = tcache.entries[tc_idx];
+      tcache.entries[tc_idx] = e->next;
+      tcache.counts[tc_idx] --;
+      return (void *) e;
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
 
@@ -3387,6 +3489,38 @@ _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = size2tidx (nb-SIZE_SZ);
+          if (tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache.counts[tc_idx] < mp_.tcache_count
+                     && (pp = *fb) != NULL)
+                {
+                  do
+                    {
+                      tc_victim = pp;
+                      if (tc_victim == NULL)
+                        break;
+                    }
+                  while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+                         != tc_victim);
+                  if (tc_victim != 0)
+                    {
+                      TCacheEntry *e = (TCacheEntry *) chunk2mem(tc_victim);
+                      e->next = tcache.entries[tc_idx];
+                      tcache.entries[tc_idx] = e;
+                      tcache.counts[tc_idx] ++;
+                      found ++;
+                    }
+                }
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
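(Another aside, not part of the patch: the cache itself is just one
singly-linked list of returned chunks per size class, kept in
thread-local storage so neither the malloc fast path nor the free fast
path takes a lock.  The toy model below -- plain heap nodes instead of
malloc chunks, and a hard-coded bucket index -- shows the push/pop
discipline that _int_free and __libc_malloc follow in the hunks above.)

/* Toy model of the per-thread cache lists (illustration only).  */
#include <stdio.h>
#include <stdlib.h>

#define TCACHE_IDX        64
#define TCACHE_FILL_COUNT 7

typedef struct TCacheEntry
{
  struct TCacheEntry *next;
} TCacheEntry;

static __thread TCacheEntry *entries[TCACHE_IDX];
static __thread char counts[TCACHE_IDX];

/* The "free" half: push a block onto its bucket if there is room;
   otherwise the caller falls back to the normal arena path.  */
static int
cache_put (void *block, size_t tc_idx)
{
  if (tc_idx >= TCACHE_IDX || counts[tc_idx] >= TCACHE_FILL_COUNT)
    return 0;
  TCacheEntry *e = block;
  e->next = entries[tc_idx];
  entries[tc_idx] = e;
  ++counts[tc_idx];
  return 1;
}

/* The "malloc" half: pop a cached block if one is available.  */
static void *
cache_get (size_t tc_idx)
{
  if (tc_idx >= TCACHE_IDX || entries[tc_idx] == NULL)
    return NULL;
  TCacheEntry *e = entries[tc_idx];
  entries[tc_idx] = e->next;
  --counts[tc_idx];
  return e;
}

int
main (void)
{
  void *b = malloc (64);
  cache_put (b, 4);               /* pretend 64 bytes maps to bucket 4 */
  void *again = cache_get (4);    /* handed straight back, lock-free */
  printf ("%s\n", again == b ? "reused from cache" : "cache miss");
  free (again);
  return 0;
}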
@@ -3425,6 +3559,40 @@ _int_malloc (mstate av, size_t bytes)
           if (av != &main_arena)
             set_non_main_arena (victim);
           check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = size2tidx (nb-SIZE_SZ);
+          if (tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache.counts[tc_idx] < mp_.tcache_count
+                     && (tc_victim = last(bin)) != bin)
+                {
+                  if (tc_victim != 0)
+                    {
+                      bck = tc_victim->bk;
+                      set_inuse_bit_at_offset (tc_victim, nb);
+                      if (av != &main_arena)
+                        set_non_main_arena (tc_victim);
+                      bin->bk = bck;
+                      bck->fd = bin;
+
+                      TCacheEntry *e = (TCacheEntry *) chunk2mem(tc_victim);
+                      e->next = tcache.entries[tc_idx];
+                      tcache.entries[tc_idx] = e;
+                      tcache.counts[tc_idx] ++;
+                      found ++;
+                      //_m_printf("snarf chunk %p %lx %p %lx\n", tc_victim, nb,
+                      //          chunk_at_offset(tc_victim, nb), chunk_at_offset(tc_victim, nb)->size);
+                    }
+                }
+              //_m_printf("%d chunks found in smallbin\n", found);
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3463,6 +3631,14 @@ _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  if (size2tidx (nb-SIZE_SZ) <= mp_.tcache_max)
+    tcache_nb = nb;
+  size_t tc_idx = size2tidx (nb-SIZE_SZ);
+  int return_cached = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3699,29 @@ _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
                 set_non_main_arena (victim);
+#if USE_TCACHE
+              /* Fill cache first, return to user only if cache fills.
+                 We may return one of these chunks later.  */
+              if (tcache_nb
+                  && tcache.counts[tc_idx] < mp_.tcache_count)
+                {
+                  TCacheEntry *e = (TCacheEntry *) chunk2mem(victim);
+                  e->next = tcache.entries[tc_idx];
+                  tcache.entries[tc_idx] = e;
+                  tcache.counts[tc_idx] ++;
+                  return_cached = 1;
+                  continue;
+                }
+              else
+                {
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+                }
+#endif
             }
 
           /* place chunk in bin */
@@ -3598,6 +3793,17 @@ _int_malloc (mstate av, size_t bytes)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+        {
+          TCacheEntry *e = tcache.entries[tc_idx];
+          tcache.entries[tc_idx] = e->next;
+          tcache.counts[tc_idx] --;
+          return (void *) e;
+        }
+#endif
+
       /* If a large request, scan through the chunks of current bin in
          sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4089,23 @@ _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = size2tidx (size - SIZE_SZ);
+
+    if (tc_idx < mp_.tcache_max
+        && tcache.counts[tc_idx] < mp_.tcache_count
+        && tcache.initted == 1)
+      {
+        TCacheEntry *e = (TCacheEntry *) chunk2mem (p);
+        e->next = tcache.entries[tc_idx];
+        tcache.entries[tc_idx] = e;
+        tcache.counts[tc_idx] ++;
+        return;
+      }
+  }
+#endif
+
   /* If eligible, place chunk on a fastbin so it can be found
      and used quickly in malloc.
@@ -4904,6 +5127,26 @@ __libc_mallopt (int param_number, int value)
       if (value > 0)
         do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+        {
+          LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+          mp_.tcache_count = value;
+        }
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+        {
+          value = size2tidx (value);
+          if (value < TCACHE_IDX)
+            {
+              LIBC_PROBE (memory_mallopt_tcache_max, 2, value, mp_.tcache_max);
+              mp_.tcache_max = value;
+            }
+        }
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;
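(Last aside, also not part of the patch: if you just want to feel the
difference, the ad-hoc loop below hammers the small, same-size
malloc/free pattern the cache is aimed at.  It is not the trace-capture
or simulator tooling from the dj/malloc branch, only a quick smoke test
to run against builds configured with and without
--disable-experimental-malloc.)

/* Ad-hoc stress loop for the malloc/free fast path (illustration only).
   Build: gcc -O2 -pthread tcache-stress.c -o tcache-stress  */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NTHREADS     4
#define ITERATIONS   10000000L
#define REQUEST_SIZE 64          /* small enough to stay in the cache */

static void *
worker (void *arg)
{
  for (long i = 0; i < ITERATIONS; i++)
    {
      void *p = malloc (REQUEST_SIZE);
      if (p == NULL)
        abort ();
      ((volatile char *) p)[0] = 1;   /* touch the block */
      free (p);                       /* goes back into this thread's cache */
    }
  return arg;
}

int
main (void)
{
  pthread_t threads[NTHREADS];

  for (int i = 0; i < NTHREADS; i++)
    pthread_create (&threads[i], NULL, worker, NULL);
  for (int i = 0; i < NTHREADS; i++)
    pthread_join (threads[i], NULL);

  puts ("done");
  return 0;
}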