From patchwork Wed Apr 25 02:56:27 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Kemi Wang <kemi.wang@intel.com>
X-Patchwork-Id: 26932
Received: (qmail 119540 invoked by alias); 25 Apr 2018 02:59:00 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 119486 invoked by uid 89); 25 Apr 2018 02:58:59 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-25.9 required=5.0 tests=AWL, BAYES_00,
	GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_SHORT,
	SPF_PASS autolearn=ham version=3.3.2 spammy=alarm, measures,
	unlock, thr
X-HELO: mga04.intel.com
X-Amp-Result: SKIPPED(no attachment in message)
X-Amp-File-Uploaded: False
X-ExtLoop1: 1
From: Kemi Wang <kemi.wang@intel.com>
To: Adhemerval Zanella <adhemerval.zanella@linaro.org>,
	Glibc alpha <libc-alpha@sourceware.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>,
	Tim Chen <tim.c.chen@intel.com>, Andi Kleen <andi.kleen@intel.com>,
	Ying Huang <ying.huang@intel.com>, Aaron Lu <aaron.lu@intel.com>,
	Lu Aubrey <aubrey.li@intel.com>, Kemi Wang <kemi.wang@intel.com>
Subject: [PATCH v2 2/3] benchtests: Add pthread adaptive spin mutex
	microbenchmark
Date: Wed, 25 Apr 2018 10:56:27 +0800
Message-Id: <1524624988-29141-2-git-send-email-kemi.wang@intel.com>
In-Reply-To: <1524624988-29141-1-git-send-email-kemi.wang@intel.com>
References: <1524624988-29141-1-git-send-email-kemi.wang@intel.com>

Add a microbenchmark for measuring mutex lock and unlock performance with
varying numbers of threads and varying sizes of the critical section. The
benchmark uses mutex lock and unlock operations to protect the critical
section and, within a fixed duration, reports the minimum and maximum
per-thread iteration counts, the total iteration count and the total
execution time. Variants are run with 1, 2, 3, 4, nproc/4, nproc/2 and nproc threads.

The size of the critical section is determined by the number of times a
global variable is incremented, which is intended to emulate the critical
region of real applications. In this microbenchmark, the values 1, 10, 100
and 1000 are used to represent different critical section sizes.

    * benchtests/bench-mutex-adaptive-thread.c: Microbenchmark for
    adaptive spin mutex.
    * benchtests/Makefile: Add adaptive spin mutex benchmark.

ChangeLog:
    V1->V2: newly added microbenchmark, as requested by Adhemerval

Signed-off-by: Kemi Wang <kemi.wang@intel.com>
---
 ChangeLog                                |   6 +
 benchtests/Makefile                      |  36 ++++-
 benchtests/bench-mutex-adaptive-thread.c | 230 +++++++++++++++++++++++++++++++
 3 files changed, 265 insertions(+), 7 deletions(-)
 create mode 100644 benchtests/bench-mutex-adaptive-thread.c

diff --git a/ChangeLog b/ChangeLog
index 4750b11..76d2628 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2018-04-24  Kemi Wang <kemi.wang@intel.com>
 
+	* benchtests/bench-mutex-adaptive-thread.c: Microbenchmark for adaptive
+	spin mutex.
+	* benchtests/Makefile: Add adaptive spin mutex benchmark.
+
+2018-04-24  Kemi Wang <kemi.wang@intel.com>
+
 	* elf/dl-tunables.list: Add glibc.mutex.spin_count entry.
 	* manual/tunables.texi: Add glibc.mutex.spin_count description.
 	* nptl/Makefile: Add pthread_mutex_conf.c for compilation.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index bcd6a9c..fcc768f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -95,10 +95,17 @@ else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
 
+ifeq (${BENCHSET},)
+bench-mutex := mutex-adaptive-thread
+else
+bench-mutex := $(filter mutex-%,${BENCHSET})
+endif
+
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
 $(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-mutex)): $(shared-thread-library)
 
 
 
@@ -119,6 +126,7 @@ include ../Rules
 binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
 binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
 binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))
+binaries-bench-mutex := $(addprefix $(objpfx)bench-,$(bench-mutex))
 
 # The default duration: 10 seconds.
 ifndef BENCH_DURATION
@@ -142,7 +150,7 @@ endif
 # This makes sure CPPFLAGS-nonlib and CFLAGS-nonlib are passed
 # for all these modules.
 cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) \
-		 $(binaries-bench-malloc:=.c)
+	$(binaries-bench-malloc:=.c) $(binaries-bench-mutex:=.c)
 lib := nonlib
 include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
 
@@ -158,6 +166,7 @@ bench-clean:
 	rm -f $(binaries-bench) $(addsuffix .o,$(binaries-bench))
 	rm -f $(binaries-benchset) $(addsuffix .o,$(binaries-benchset))
 	rm -f $(binaries-bench-malloc) $(addsuffix .o,$(binaries-bench-malloc))
+	rm -f $(binaries-bench-mutex) $(addsuffix .o,$(binaries-bench-mutex))
 	rm -f $(timing-type) $(addsuffix .o,$(timing-type))
 	rm -f $(addprefix $(objpfx),$(bench-extra-objs))
 
@@ -165,7 +174,7 @@ bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread mutex-adaptive-thread
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -176,7 +185,7 @@ endif
 
 # Define the bench target only if the target has a usable python installation.
 ifdef PYTHON
-bench: bench-build bench-set bench-func bench-malloc
+bench: bench-build bench-set bench-func bench-malloc bench-mutex
 else
 bench:
 	@echo "The bench target needs python to run."
@@ -187,10 +196,10 @@ endif
 # only if we're building natively.
 ifeq (no,$(cross-compiling))
 bench-build: $(gen-locales) $(timing-type) $(binaries-bench) \
-	$(binaries-benchset) $(binaries-bench-malloc)
+	$(binaries-benchset) $(binaries-bench-malloc) $(binaries-bench-mutex)
 else
 bench-build: $(timing-type) $(binaries-bench) $(binaries-benchset) \
-	$(binaries-bench-malloc)
+	$(binaries-bench-malloc) $(binaries-bench-mutex)
 endif
 
 bench-set: $(binaries-benchset)
@@ -207,6 +216,19 @@ bench-malloc: $(binaries-bench-malloc)
 	  done;\
 	done
 
+# Run benchmark with 1, 2, 3, 4, nproc/4, nproc/2, nproc threads.  Counts
+# that are zero, already run, or above nproc are skipped.
+bench-mutex: $(binaries-bench-mutex)
+	for run in $^; do prev=0; \
+	  for thr in 1 2 3 4 $$((`nproc` / 4)) $$((`nproc` / 2)) `nproc`; do \
+	    if [ $$thr -gt $$prev ] && [ $$thr -le `nproc` ]; then \
+	      echo "Running $${run} $${thr}"; \
+	      $(run-bench) $${thr} > $${run}-$${thr}.out; \
+	      prev=$$thr; \
+	    fi; \
+	  done;\
+	done
+
 # Build and execute the benchmark functions.  This target generates JSON
 # formatted bench.out.  Each of the programs produce independent JSON output,
 # so one could even execute them individually and process it using any JSON
@@ -236,8 +258,8 @@ bench-func: $(binaries-bench)
 	fi
 
 $(timing-type) $(binaries-bench) $(binaries-benchset) \
-	$(binaries-bench-malloc): %: %.o $(objpfx)json-lib.o \
-	$(link-extra-libs-tests) \
+	$(binaries-bench-malloc) $(binaries-bench-mutex): \
+	%: %.o $(objpfx)json-lib.o $(link-extra-libs-tests) \
   $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \
   $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit)
 	$(+link-tests)
diff --git a/benchtests/bench-mutex-adaptive-thread.c b/benchtests/bench-mutex-adaptive-thread.c
new file mode 100644
index 0000000..51a92a7
--- /dev/null
+++ b/benchtests/bench-mutex-adaptive-thread.c
@@ -0,0 +1,230 @@
+/* Benchmark pthread adaptive spin mutex lock and unlock functions.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* Benchmark duration in seconds.  */
+#define BENCHMARK_DURATION	15
+
+#define TYPE PTHREAD_MUTEX_ADAPTIVE_NP	/* Mutex type set in init_mutex.  */
+
+static unsigned long long val;		/* Incremented while MUTEX is held.  */
+static pthread_mutexattr_t attr;	/* Attributes MUTEX is created with.  */
+static pthread_mutex_t mutex;		/* The lock being benchmarked.  */
+
+#define WORKING_SET_SIZE  4		/* Number of entries in working_set.  */
+int working_set[] = {1, 10, 100, 1000};	/* Critical section sizes.  */
+
+struct thread_args
+{
+  unsigned long long iters;	/* Lock/unlock rounds this thread completed.  */
+  int working_set;		/* Increments done per critical section.  */
+  timing_t elapsed;		/* Time spent in the benchmark loop.  */
+};
+
+static void init_mutex (void)	/* Create the adaptive mutex under test.  */
+{
+  if (pthread_mutexattr_init (&attr) || pthread_mutexattr_settype (&attr, TYPE)
+      || pthread_mutex_init (&mutex, &attr))
+    abort ();	/* Never time a mutex that was not set up as requested.  */
+}
+
+/* Zero the first NUM_THREAD entries of ARGS and give each of them the
+   critical section length SIZE.  */
+static void
+init_parameter (int size, struct thread_args *args, int num_thread)
+{
+  struct thread_args *slot = args;
+
+  for (int left = num_thread; left > 0; left--, slot++)
+    *slot = (struct thread_args) { .working_set = size };
+}
+
+static volatile bool timeout;	/* Set by alarm_handler; ends the loops.  */
+
+static void
+alarm_handler (int signum)	/* SIGALRM handler installed by main.  */
+{
+  timeout = true;
+}
+
+/* Lock and unlock around SIZE increments of val until timeout is set.  */
+static unsigned long long
+mutex_benchmark_loop (int size)
+{
+  volatile int count;	/* NOTE(review): presumably volatile to keep the loop.  */
+  unsigned long long iters = 0;	/* Completed lock/unlock rounds; returned.  */
+
+  while (!timeout)
+    {
+      count = size;
+      pthread_mutex_lock (&mutex);
+      while (count > 0)	/* The critical section proper.  */
+        {
+          val++;
+          count--;
+        }
+      pthread_mutex_unlock (&mutex);
+      iters++;
+    }
+  return iters;
+}
+
+/* Thread start routine.  ARG points to this thread's struct thread_args;
+   the timed loop's iteration count and elapsed time are stored there.  */
+static void *
+benchmark_thread (void *arg)
+{
+  struct thread_args *self = arg;
+  timing_t begin, end;
+
+  TIMING_NOW (begin);
+  unsigned long long rounds = mutex_benchmark_loop (self->working_set);
+  TIMING_NOW (end);
+
+  self->iters = rounds;
+  TIMING_DIFF (self->elapsed, begin, end);
+  return NULL;
+}
+
+/* Run the benchmark loop on NUM_THREAD threads, storing each thread's
+   result in its ARGS slot.  A single thread runs in the caller.  */
+static void
+do_benchmark (size_t num_thread, struct thread_args *args)
+{
+  if (num_thread == 1)
+    {
+      /* Same timed sequence as a worker, without creating a thread.  */
+      benchmark_thread (args);
+      return;
+    }
+
+  pthread_t threads[num_thread];
+
+  for (size_t i = 0; i < num_thread; i++)
+    if (pthread_create (&threads[i], NULL, benchmark_thread, args + i) != 0)
+      {
+        /* Joining a thread that was never created is undefined.  */
+        fputs ("pthread_create failed\n", stderr);
+        abort ();
+      }
+
+  for (size_t i = 0; i < num_thread; i++)
+    pthread_join (threads[i], NULL);
+}
+
+static void usage(const char *name)	/* main passes argv[0] as NAME.  */
+{
+  fprintf (stderr, "%s: <num_thread>\n", name);	/* Synopsis on stderr.  */
+  exit (1);	/* Terminates the process with status 1.  */
+}
+
+/* Time every critical section size in working_set with the thread count
+   from argv[1] (default 1), printing one JSON document per size.  */
+int
+main (int argc, char **argv)
+{
+  int num_thread = 1;
+  json_ctx_t json_ctx;
+  struct sigaction act;
+
+  if (argc == 2)
+    {
+      char *end;
+      long ret;
+
+      errno = 0;
+      ret = strtol (argv[1], &end, 10);
+      /* Reject trailing garbage and counts that are not positive or do not
+         fit in an int: num_thread sizes a variable length array below.  */
+      if (errno != 0 || end == argv[1] || *end != '\0'
+          || ret <= 0 || ret != (int) ret)
+        usage (argv[0]);
+      num_thread = ret;
+    }
+  else if (argc != 1)
+    usage (argv[0]);
+
+  /* The handler is the same for every run, so install it only once.  */
+  memset (&act, 0, sizeof (act));
+  act.sa_handler = &alarm_handler;
+  sigaction (SIGALRM, &act, NULL);
+
+  /* Benchmark for different critical section size.  */
+  for (int i = 0; i < WORKING_SET_SIZE; i++)
+    {
+      int size = working_set[i];
+      struct thread_args args[num_thread];
+      unsigned long long iters = 0, min_iters = -1ULL, max_iters = 0;
+      double d_total_s = 0, d_total_i = 0;
+
+      timeout = false;
+      init_mutex ();
+      init_parameter (size, args, num_thread);
+
+      json_init (&json_ctx, 0, stdout);
+      json_document_begin (&json_ctx);
+      json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+      json_attr_object_begin (&json_ctx, "functions");
+      json_attr_object_begin (&json_ctx, "mutex");
+      json_attr_object_begin (&json_ctx, "");
+
+      alarm (BENCHMARK_DURATION);
+      do_benchmark (num_thread, args);
+
+      /* init_mutex runs again for the next size; release this run's
+         objects first so that they are never initialized twice.  */
+      pthread_mutex_destroy (&mutex);
+      pthread_mutexattr_destroy (&attr);
+
+      for (int j = 0; j < num_thread; j++)
+        {
+          iters = args[j].iters;
+          if (iters < min_iters)
+            min_iters = iters;
+          if (iters >= max_iters)
+            max_iters = iters;
+          d_total_i += iters;
+          TIMING_ACCUM (d_total_s, args[j].elapsed);
+        }
+      json_attr_double (&json_ctx, "duration", d_total_s);
+      json_attr_double (&json_ctx, "total_iterations", d_total_i);
+      json_attr_double (&json_ctx, "min_iteration", min_iters);
+      json_attr_double (&json_ctx, "max_iteration", max_iters);
+      json_attr_double (&json_ctx, "time_per_iteration",
+                        d_total_s / d_total_i);
+      json_attr_double (&json_ctx, "threads", num_thread);
+      json_attr_double (&json_ctx, "critical_section_size", size);
+      json_attr_object_end (&json_ctx);
+      json_attr_object_end (&json_ctx);
+      json_attr_object_end (&json_ctx);
+      json_document_end (&json_ctx);
+      fputs ("\n", json_ctx.fp);
+    }
+  return 0;
+}