From patchwork Fri Mar 10 05:02:17 2017
X-Patchwork-Submitter: DJ Delorie
X-Patchwork-Id: 19505
From: DJ Delorie
To: libc-alpha@sourceware.org
Cc: Will Newton
Subject: [updated patch] malloc per-thread cache ready for review
In-Reply-To: (message from Will Newton on Thu, 2 Mar 2017 20:06:45 +0000)
Date: Fri, 10 Mar 2017 00:02:17 -0500
Message-ID:
MIME-Version: 1.0

Branch updated...

Fix whitespace around operators.

Define MAYBE_INIT_TCACHE to expand to nothing when tcache is disabled,
and remove the wrapper function.

Add helper functions tcache_get () and tcache_put () to collect the
common code in one spot.

Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

I also put a copy of the oocalc workload online, as a sample of the
workloads I've been using to benchmark.  It's 16M compressed, and you
have to decompress it (to 65M) to use it with the simulator.

http://www.delorie.com/tmp/oocalc.wl.bz2

(if you use wget, use "-U malloc" ;)
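For anyone who wants to exercise the new knobs while reviewing: the patch
wires up three tunables (glibc.malloc.tcache_max, glibc.malloc.tcache_count
and glibc.malloc.tcache_unsorted_limit) whose environment aliases are
MALLOC_TCACHE_MAX, MALLOC_TCACHE_COUNT and MALLOC_TCACHE_UNSORTED_LIMIT,
and the whole feature is only compiled in when glibc is configured with the
(default-on) --enable-experimental-malloc.  A throwaway smoke test along the
lines of the one below (not part of the patch; the size and iteration count
are arbitrary) hammers a single bin so that frees land in the per-thread
cache and the following mallocs are served from it; running it under
different MALLOC_TCACHE_COUNT values is an easy way to see the knob take
effect.

/* tcache-smoke.c - throwaway smoke test, not part of the patch.
   Allocates and frees one size in a tight loop so that, with the patch
   applied, each free lands in the per-thread cache and the next malloc
   is served from it.  Build:  gcc -O2 -o tcache-smoke tcache-smoke.c
   Run e.g.:  MALLOC_TCACHE_COUNT=0 ./tcache-smoke   (cache never fills)
              MALLOC_TCACHE_COUNT=7 ./tcache-smoke   (the default)  */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int
main (void)
{
  enum { ITERS = 10000000, SZ = 64 };   /* arbitrary */
  volatile unsigned char sink = 0;      /* keep the loop from being optimized away */
  struct timespec t0, t1;

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < ITERS; i++)
    {
      unsigned char *p = malloc (SZ);
      if (p == NULL)
        return 1;
      p[0] = (unsigned char) i;
      sink += p[0];
      free (p);
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);

  printf ("%d malloc/free pairs of %d bytes: %.3f s (sink %u)\n",
          (int) ITERS, (int) SZ,
          (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9,
          (unsigned) sink);
  return 0;
}

Setting GLIBC_TUNABLES=glibc.malloc.tcache_count=... reaches the same code
through the tunables path instead of the MALLOC_* environment parsing.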
diff --git a/config.make.in b/config.make.in
index 5836b32..0290d83 100644
--- a/config.make.in
+++ b/config.make.in
@@ -77,6 +77,8 @@ multi-arch = @multi_arch@
 
 mach-interface-list = @mach_interface_list@
 
+experimental-malloc = @experimental_malloc@
+
 nss-crypt = @libc_cv_nss_crypt@
 static-nss-crypt = @libc_cv_static_nss_crypt@
diff --git a/configure.ac b/configure.ac
index 4a77411..b929012 100644
--- a/configure.ac
+++ b/configure.ac
@@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
 	      [multi_arch=$enableval],
 	      [multi_arch=default])
 
+AC_ARG_ENABLE([experimental-malloc],
+	      AC_HELP_STRING([--disable-experimental-malloc],
+			     [disable experimental malloc features]),
+	      [experimental_malloc=$enableval],
+	      [experimental_malloc=yes])
+AC_SUBST(experimental_malloc)
+
 AC_ARG_ENABLE([nss-crypt],
 	      AC_HELP_STRING([--enable-nss-crypt],
 			     [enable libcrypt to use nss]),
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index cb9e8f1..37620c8 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -76,5 +76,17 @@ glibc {
       minval: 1
       security_level: SXID_IGNORE
     }
+    tcache_max {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_MAX
+    }
+    tcache_count {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_COUNT
+    }
+    tcache_unsorted_limit {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
+    }
   }
 }
diff --git a/malloc/Makefile b/malloc/Makefile
index e93b83b..dd8a43a 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
 tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
 
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DUSE_TCACHE
+endif
 # Uncomment this for test releases.  For public releases it is too expensive.
 #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
diff --git a/malloc/arena.c b/malloc/arena.c
index d49e4a2..79e918f 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
+#if USE_TCACHE
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
+#endif
 #else
 /* Initialization routine.  */
 #include <string.h>
@@ -322,6 +327,11 @@ ptmalloc_init (void)
   TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
+#if USE_TCACHE
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
+#endif
   __libc_lock_unlock (main_arena.mutex);
 #else
   const char *s = NULL;
@@ -371,7 +381,23 @@ ptmalloc_init (void)
               if (memcmp (envline, "ARENA_TEST", 10) == 0)
                 __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
             }
+#if USE_TCACHE
+          if (!__builtin_expect (__libc_enable_secure, 0))
+            {
+              if (memcmp (envline, "TCACHE_MAX", 10) == 0)
+                __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
+            }
+#endif
           break;
+#if USE_TCACHE
+        case 12:
+          if (!__builtin_expect (__libc_enable_secure, 0))
+            {
+              if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
+                __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
+            }
+          break;
+#endif
         case 15:
           if (!__builtin_expect (__libc_enable_secure, 0))
             {
@@ -381,6 +407,15 @@ ptmalloc_init (void)
                 __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
             }
           break;
+#if USE_TCACHE
+        case 21:
+          if (!__builtin_expect (__libc_enable_secure, 0))
+            {
+              if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
+                __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
+            }
+          break;
+#endif
         default:
           break;
         }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index e29105c..8cd03d8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
 }
 #endif
 
+#ifndef USE_TCACHE
+# define USE_TCACHE 0
+#endif
+#if USE_TCACHE
+/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
+# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)
+# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
+# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+
+# define tidx2csize(idx)	((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
+# define tidx2usize(idx)	((idx) * MALLOC_ALIGNMENT)
+
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) size2tidx_ (x)
+/* When "x" is from chunksize().  */
+# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
+
+/* Rounds up, so...
+   idx 0   bytes 0
+   idx 1   bytes 1..8
+   idx 2   bytes 9..16
+   etc.  */
+
+/* This is another arbitrary limit, which tunables can change.  */
+# define TCACHE_FILL_COUNT 7
+#endif
+
 
 /*
   REALLOC_ZERO_BYTES_FREES should be set if a call to
@@ -1711,6 +1738,17 @@ struct malloc_par
 
   /* First address handed out by MORECORE/sbrk.  */
   char *sbrk_base;
+
+#if USE_TCACHE
+  /* Maximum number of buckets to use.  */
+  size_t tcache_max;
+  size_t tcache_max_bytes;
+  /* Maximum number of chunks in each bucket.  */
+  size_t tcache_count;
+  /* Maximum number of chunks to remove from the unsorted list, which
+     don't match.  */
+  size_t tcache_unsorted_limit;
+#endif
 };
 
 /* There are several instances of this struct ("arenas") in this
@@ -1749,8 +1787,22 @@ static struct malloc_par mp_ =
   .trim_threshold = DEFAULT_TRIM_THRESHOLD,
 #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
   .arena_test = NARENAS_FROM_NCORES (1)
+#if USE_TCACHE
+  ,
+  .tcache_count = TCACHE_FILL_COUNT,
+  .tcache_max = TCACHE_IDX,
+  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
+  .tcache_unsorted_limit = 0 /* No limit */
+#endif
 };
 
+/* Non public mallopt parameters.  */
+#if USE_TCACHE
+# define M_TCACHE_COUNT  -9
+# define M_TCACHE_MAX   -10
+# define M_TCACHE_UNSORTED_LIMIT -11
+#endif
+
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;
 
@@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers.  --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+/* There is one of these for each thread, which contains the
+   per-thread cache (hence "TCache").  Keeping overall size low is
+   mildly important.  Note that COUNTS and ENTRIES are redundant, this
+   is for performance reasons.  */
+typedef struct TCache {
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static __thread char tcache_shutting_down = 0;
+static __thread TCache *tcache = NULL;
+
+static void
+tcache_put (mchunkptr chunk, size_t tc_idx)
+{
+  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
+  e->next = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e;
+  ++(tcache->counts[tc_idx]);
+}
+
+static void *
+tcache_get (size_t tc_idx)
+{
+  TCacheEntry *e = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e->next;
+  --(tcache->counts[tc_idx]);
+  return (void *) e;
+}
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  int i;
+  TCache *tcache_tmp = tcache;
+
+  if (!tcache)
+    return;
+
+  tcache = NULL;
+
+  for (i = 0; i < TCACHE_IDX; ++i) {
+    while (tcache_tmp->entries[i])
+      {
+        TCacheEntry *e = tcache_tmp->entries[i];
+        tcache_tmp->entries[i] = e->next;
+        __libc_free (e);
+      }
+  }
+
+  __libc_free (tcache_tmp);
+
+  tcache_shutting_down = 1;
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+static void
+tcache_init(void)
+{
+  mstate ar_ptr;
+  void *victim = 0;
+  const size_t bytes = sizeof (TCache);
+
+  if (tcache_shutting_down)
+    return;
+
+  arena_get (ar_ptr, bytes);
+  victim = _int_malloc (ar_ptr, bytes);
+  if (!victim && ar_ptr != NULL)
+    {
+      ar_ptr = arena_get_retry (ar_ptr, bytes);
+      victim = _int_malloc (ar_ptr, bytes);
+    }
+
+
+  if (ar_ptr != NULL)
+    __libc_lock_unlock (ar_ptr->mutex);
+
+  if (victim)
+    {
+      tcache = (TCache *) victim;
+      memset (tcache, 0, sizeof (TCache));
+    }
+
+}
+
+#define MAYBE_INIT_TCACHE() \
+  if (__glibc_unlikely (tcache == NULL)) \
+    tcache_init();
+
+#else
+#define MAYBE_INIT_TCACHE()
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size (bytes);
+  size_t tc_idx = csize2tidx (tbytes);
+
+  MAYBE_INIT_TCACHE ();
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache
+      && tcache->entries[tc_idx] != NULL)
+    {
+      return tcache_get (tc_idx);
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
@@ -2943,6 +3110,8 @@ __libc_free (void *mem)
       return;
     }
 
+  MAYBE_INIT_TCACHE ();
+
   ar_ptr = arena_for_chunk (p);
   _int_free (ar_ptr, p, 0);
 }
@@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes)
   if (chunk_is_mmapped (oldp))
     ar_ptr = NULL;
   else
-    ar_ptr = arena_for_chunk (oldp);
+    {
+      MAYBE_INIT_TCACHE ();
+      ar_ptr = arena_for_chunk (oldp);
+    }
 
   /* Little security check which won't hurt performance: the
      allocator never wrapps around at the end of the address space.
     Therefore
@@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size)
 
   sz = bytes;
 
+  MAYBE_INIT_TCACHE ();
+
   arena_get (av, sz);
   if (av)
     {
@@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes)
   mchunkptr fwd;                    /* misc temp for linking */
   mchunkptr bck;                    /* misc temp for linking */
 
+#if USE_TCACHE
+  size_t tcache_unsorted_count;     /* count of unsorted chunks processed */
+#endif
+
   const char *errstr = NULL;
 
   /*
@@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = csize2tidx (nb);
+          if (tcache && tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache->counts[tc_idx] < mp_.tcache_count
+                     && (pp = *fb) != NULL)
+                {
+                  do
+                    {
+                      tc_victim = pp;
+                      if (tc_victim == NULL)
+                        break;
+                    }
+                  while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+                         != tc_victim);
+                  if (tc_victim != 0)
+                    {
+                      tcache_put (tc_victim, tc_idx);
+                      ++found;
+                    }
+                }
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes)
           if (av != &main_arena)
             set_non_main_arena (victim);
           check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = csize2tidx (nb);
+          if (tcache && tc_idx < mp_.tcache_max)
+            {
+              mchunkptr tc_victim;
+              int found = 0;
+
+              /* While bin not empty and tcache not full, copy chunks over.  */
+              while (tcache->counts[tc_idx] < mp_.tcache_count
+                     && (tc_victim = last (bin)) != bin)
+                {
+                  if (tc_victim != 0)
+                    {
+                      bck = tc_victim->bk;
+                      set_inuse_bit_at_offset (tc_victim, nb);
+                      if (av != &main_arena)
+                        set_non_main_arena (tc_victim);
+                      bin->bk = bck;
+                      bck->fd = bin;
+
+                      tcache_put (tc_victim, tc_idx);
+                      ++found;
+                    }
+                }
+            }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  size_t tc_idx = csize2tidx (nb);
+  if (tcache && tc_idx < mp_.tcache_max)
+    tcache_nb = nb;
+  int return_cached = 0;
+
+  tcache_unsorted_count = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
                 set_non_main_arena (victim);
+#if USE_TCACHE
+              /* Fill cache first, return to user only if cache fills.
+                 We may return one of these chunks later.  */
+              if (tcache_nb
+                  && tcache->counts[tc_idx] < mp_.tcache_count)
+                {
+                  tcache_put (victim, tc_idx);
+                  return_cached = 1;
+                  continue;
+                }
+              else
+                {
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+                }
+#endif
             }
 
           /* place chunk in bin */
@@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes)
           fwd->bk = victim;
           bck->fd = victim;
 
+#if USE_TCACHE
+      /* If we've processed as many chunks as we're allowed while
+         filling the cache, return one of the cached ones.  */
+      ++tcache_unsorted_count;
+      if (return_cached
+          && mp_.tcache_unsorted_limit > 0
+          && tcache_unsorted_count > mp_.tcache_unsorted_limit)
+        {
+          return tcache_get (tc_idx);
+        }
+#endif
+
 #define MAX_ITERS       10000
           if (++iters >= MAX_ITERS)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+        {
+          return tcache_get (tc_idx);
+        }
+#endif
+
       /*
          If a large request, scan through the chunks of current bin in
         sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = csize2tidx (size);
+
+    if (tcache
+        && tc_idx < mp_.tcache_max
+        && tcache->counts[tc_idx] < mp_.tcache_count)
+      {
+        tcache_put (p, tc_idx);
+        return;
+      }
+  }
+#endif
+
   /*
     If eligible, place chunk on a fastbin so it can be found
     and used quickly in malloc.
@@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value)
   return 1;
 }
 
+#if USE_TCACHE
+static inline int
+__always_inline
+do_set_tcache_max (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+  if (value >= 0 && value <= MAX_TCACHE_SIZE)
+    {
+      mp_.tcache_max_bytes = value;
+      mp_.tcache_max = usize2tidx (value) + 1;
+    }
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_count (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+  mp_.tcache_count = value;
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_unsorted_limit (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  mp_.tcache_unsorted_limit = value;
+  return 1;
+}
+#endif
 
 int
 __libc_mallopt (int param_number, int value)
@@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value)
       if (value > 0)
         do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+        do_set_tcache_count (value);
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+        do_set_tcache_max (value);
+      break;
+    case M_TCACHE_UNSORTED_LIMIT:
+      if (value >= 0)
+        do_set_tcache_unsorted_limit (value);
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;
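A note for reviewers on the bin-index arithmetic added near the top of
malloc.c: the mapping is easiest to see by running the macros outside of
glibc.  The snippet below is a standalone restatement of the index macros
(again, not part of the patch) with MALLOC_ALIGNMENT and SIZE_SZ hard-coded
to typical 64-bit values of 16 and 8; in glibc the real values come from the
malloc internals and vary by target, and __libc_malloc itself indexes with
csize2tidx (request2size (bytes)) rather than the raw user size, so this is
only meant to build intuition for how request sizes land in tcache bins.

/* tidx-demo.c - standalone illustration of the tcache index macros;
   not part of the patch.  MALLOC_ALIGNMENT and SIZE_SZ are hard-coded
   to common 64-bit values here (16 and 8).  */
#include <stdio.h>
#include <stddef.h>

#define MALLOC_ALIGNMENT   16
#define SIZE_SZ            8

#define MAX_TCACHE_SIZE    (MALLOC_ALIGNMENT * 63)
#define TCACHE_IDX         ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
#define size2tidx_(bytes)  (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
#define usize2tidx(x)      size2tidx_ (x)
#define tidx2usize(idx)    ((idx) * MALLOC_ALIGNMENT)

int
main (void)
{
  size_t sizes[] = { 1, 8, 16, 17, 32, 100, 1000, MAX_TCACHE_SIZE };

  printf ("TCACHE_IDX = %d bins, largest cacheable user size = %d bytes\n",
          (int) TCACHE_IDX, (int) MAX_TCACHE_SIZE);
  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      size_t idx = usize2tidx (sizes[i]);
      printf ("request %4zu bytes -> idx %2zu (bin covers up to %zu bytes)\n",
              sizes[i], idx, (size_t) tidx2usize (idx));
    }
  return 0;
}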