[v2] New benchtest: pthread locks
Commit Message
v2 changes do_bench_2 to run multiple passes and provide a mean and
standard deviation as well as some other statistics, since I got tired
of running the whole benchmark multiple times and computing them
manually ;-)
From 94653e93eec9403ebf61b00179799751ce12088d Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Wed, 7 Oct 2020 17:04:12 -0400
Subject: New benchtest: pthread locks
Performance benchmarks for various posix locks: mutex, rwlock,
spinlock, condvar, and semaphore. Each test is performed with
an empty loop body or with a computationally "interesting" (i.e.
difficult to optimize away, and used just to allow lock code to
be "hidden" in the filler's CPU cycles).
Comments
Hi DJ,
here is what I find on my processor (Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz)
with your patch (note that it was under heavy load). Does it look ok?
Best regards,
Paul
"pthread_locks": {
"mutex": {
"empty": {
"duration": 1.29659e+08,
"iterations": 1e+06,
"wall_sec": 0.0393829,
"mean": 103.086,
"stdev": 18.8037,
"min_outlier": 91.389,
"min": 91.396,
"max": 130.909,
"max_outlier": 130.91
},
"filler": {
"duration": 1.81231e+08,
"iterations": 1e+06,
"wall_sec": 0.055047,
"mean": 173.598,
"stdev": 12.8244,
"min_outlier": 166.75,
"min": 166.751,
"max": 206.287,
"max_outlier": 208.943
}
},
"mutex_trylock": {
"empty": {
"duration": 8.00459e+07,
"iterations": 1e+06,
"wall_sec": 0.0243139,
"mean": 54.1088,
"stdev": 19.6554,
"min_outlier": 27.3545,
"min": 27.3548,
"max": 80.0422,
"max_outlier": 80.0459
},
"filler": {
"duration": 1.43876e+08,
"iterations": 1e+06,
"wall_sec": 0.0437009,
"mean": 134.758,
"stdev": 16.2904,
"min_outlier": 104.209,
"min": 104.217,
"max": 143.958,
"max_outlier": 144.563
}
},
"rwlock_read": {
"empty": {
"duration": 1.38636e+08,
"iterations": 1e+06,
"wall_sec": 0.0421081,
"mean": 121.985,
"stdev": 15.337,
"min_outlier": 93.4482,
"min": 99.1126,
"max": 138.636,
"max_outlier": 151.863
},
"filler": {
"duration": 2.11036e+08,
"iterations": 1e+06,
"wall_sec": 0.0641,
"mean": 186.195,
"stdev": 25.7311,
"min_outlier": 156.256,
"min": 157.995,
"max": 223.986,
"max_outlier": 227.361
}
},
"rwlock_tryread": {
"empty": {
"duration": 8.48573e+06,
"iterations": 1e+06,
"wall_sec": 0.00257683,
"mean": 12.4413,
"stdev": 12.496,
"min_outlier": 8.48572,
"min": 8.48573,
"max": 48.0057,
"max_outlier": 48.0071
},
"filler": {
"duration": 6.14027e+07,
"iterations": 100000,
"wall_sec": 0.0186498,
"mean": 99.9205,
"stdev": 124.966,
"min_outlier": 60.3437,
"min": 60.3437,
"max": 455.579,
"max_outlier": 614.027
}
},
"rwlock_write": {
"empty": {
"duration": 4.82128e+07,
"iterations": 100000,
"wall_sec": 0.0146441,
"mean": 179.078,
"stdev": 164.916,
"min_outlier": 86.77,
"min": 86.7724,
"max": 482.109,
"max_outlier": 482.128
},
"filler": {
"duration": 2.29868e+08,
"iterations": 1e+06,
"wall_sec": 0.0698202,
"mean": 222.557,
"stdev": 12.5448,
"min_outlier": 190.339,
"min": 190.977,
"max": 229.963,
"max_outlier": 243.063
}
},
"rwlock_trywrite": {
"empty": {
"duration": 4.70629e+07,
"iterations": 1e+06,
"wall_sec": 0.0142951,
"mean": 15.4493,
"stdev": 16.6618,
"min_outlier": 7.54287,
"min": 7.54287,
"max": 47.0629,
"max_outlier": 47.063
},
"filler": {
"duration": 1.0035e+08,
"iterations": 1e+06,
"wall_sec": 0.0304809,
"mean": 116.491,
"stdev": 23.9815,
"min_outlier": 97.7879,
"min": 97.9946,
"max": 158.148,
"max_outlier": 169.458
}
},
"spin_lock": {
"empty": {
"duration": 2.26345e+07,
"iterations": 1e+06,
"wall_sec": 0.00687504,
"mean": 42.3982,
"stdev": 20.8246,
"min_outlier": 22.6345,
"min": 22.6348,
"max": 62.1574,
"max_outlier": 75.329
},
"filler": {
"duration": 1.43121e+08,
"iterations": 1e+06,
"wall_sec": 0.0434711,
"mean": 130.691,
"stdev": 20.6261,
"min_outlier": 103.234,
"min": 103.728,
"max": 158.001,
"max_outlier": 160.629
}
},
"spin_trylock": {
"empty": {
"duration": 6.40642e+07,
"iterations": 1e+06,
"wall_sec": 0.019459,
"mean": 49.7264,
"stdev": 18.2296,
"min_outlier": 24.52,
"min": 24.5262,
"max": 64.0642,
"max_outlier": 77.2103
},
"filler": {
"duration": 9.69735e+07,
"iterations": 1e+06,
"wall_sec": 0.029454,
"mean": 110.624,
"stdev": 18.102,
"min_outlier": 92.2759,
"min": 96.6148,
"max": 136.139,
"max_outlier": 136.344
}
},
"sem_wait": {
"empty": {
"duration": 1.40348e+08,
"iterations": 1e+06,
"wall_sec": 0.042629,
"mean": 124.395,
"stdev": 16.9173,
"min_outlier": 100.823,
"min": 100.825,
"max": 140.354,
"max_outlier": 172.701
},
"filler": {
"duration": 2.09171e+08,
"iterations": 1e+06,
"wall_sec": 0.0635331,
"mean": 177.185,
"stdev": 19.1812,
"min_outlier": 145.352,
"min": 155.315,
"max": 209.171,
"max_outlier": 220.138
}
},
"sem_trywait": {
"empty": {
"duration": 4.02749e+07,
"iterations": 100000,
"wall_sec": 0.012233,
"mean": 7.54332,
"stdev": 0.000211187,
"min_outlier": 7.54305,
"min": 7.54305,
"max": 7.54359,
"max_outlier": 402.749
},
"filler": {
"duration": 1.00351e+08,
"iterations": 1e+06,
"wall_sec": 0.0304799,
"mean": 123.021,
"stdev": 19.3645,
"min_outlier": 99.7954,
"min": 100.351,
"max": 139.863,
"max_outlier": 139.869
}
},
"condvar": {
"empty": {
"duration": 5.1244e+07,
"iterations": 1000,
"wall_sec": 0.015655,
"mean": 45379.1,
"stdev": 5285.79,
"min_outlier": 18260.9,
"min": 31450.7,
"max": 51244,
"max_outlier": 57756.6
},
"filler": {
"duration": 4.73658e+07,
"iterations": 1000,
"wall_sec": 0.0144019,
"mean": 40550.8,
"stdev": 10623.3,
"min_outlier": 18126,
"min": 18233.3,
"max": 52647.8,
"max_outlier": 57678.7
}
},
"consumer_producer": {
"empty": {
"duration": 8.02147e+07,
"iterations": 1000,
"wall_sec": 0.0243819,
"mean": 77217.4,
"stdev": 25019.9,
"min_outlier": 51949,
"min": 52390.9,
"max": 115883,
"max_outlier": 118116
},
"filler": {
"duration": 9.22739e+07,
"iterations": 1000,
"wall_sec": 0.028044,
"mean": 82932,
"stdev": 23952.2,
"min_outlier": 52421.7,
"min": 54514.3,
"max": 112212,
"max_outlier": 115371
}
}
},
On 10/10/20 9:25 AM, DJ Delorie via Libc-alpha wrote:
>
> v2 changes do_bench_2 to run multiple passes and provide a mean and
> standard deviation as well as some other statistics, since I got tired
> of running the whole benchmark multiple times and computing them
> manually ;-)
>
> From 94653e93eec9403ebf61b00179799751ce12088d Mon Sep 17 00:00:00 2001
> From: DJ Delorie <dj@redhat.com>
> Date: Wed, 7 Oct 2020 17:04:12 -0400
> Subject: New benchtest: pthread locks
>
> Performance benchmarks for various posix locks: mutex, rwlock,
> spinlock, condvar, and semaphore. Each test is performed with
> an empty loop body or with a computationally "interesting" (i.e.
> difficult to optimize away, and used just to allow lock code to
> be "hidden" in the filler's CPU cycles).
This is good to commit, thanks for writing these tests!
> +/* The point of this benchmark is to measure the overhead of an empty
> + critical section or a small critical section. This is never going
> + to be indicative of real application performance. Instead we are
> + trying to benchmark the effects of the compiler and the runtime
> + coupled with a particular set of hardware atomic operations.
> + The numbers from this benchmark should be taken with a massive gain
> + of salt and viewed through the eyes of expert reviewers. */
That is a great comment. All benchmarks should have a goal set like
this so that it is clear to whoever evaluates them that just getting
better numbers in a specific benchmark is not good enough and one would
have to evaluate why a certain set of results supports their optimisation.
Siddhesh
@@ -31,7 +31,7 @@ ifneq (,$(filter yes,$(float128-fcts) $(float128-alias-fcts)))
bench-math += expf128 powf128 sinf128
endif
-bench-pthread := pthread_once thread_create
+bench-pthread := pthread_once thread_create pthread-locks
bench-string := ffs ffsll
@@ -109,6 +109,7 @@ $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
$(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
$(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
+$(addprefix $(objpfx)bench-,pthread-locks): $(libm)
new file mode 100644
@@ -0,0 +1,554 @@
+/* Measure various lock acquisition times for empty critical sections.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#define TEST_NAME "pthread-locks"
+
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <sys/time.h>
+#include <math.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* The point of this benchmark is to measure the overhead of an empty
+ critical section or a small critical section. This is never going
+ to be indicative of real application performance. Instead we are
+ trying to benchmark the effects of the compiler and the runtime
+ coupled with a particular set of hardware atomic operations.
+ The numbers from this benchmark should be taken with a massive gain
+ of salt and viewed through the eyes of expert reviewers. */
+
+static pthread_mutex_t m;
+static pthread_rwlock_t rw;
+static pthread_cond_t cv;
+static pthread_cond_t consumer_c, producer_c;
+static int cv_done;
+static pthread_spinlock_t sp;
+static sem_t sem;
+
+typedef timing_t (*test_t)(long, int);
+
+#define START_ITERS 1000
+
+#define FILLER_GOES_HERE \
+ if (filler) \
+ do_filler ();
+
+/* Everyone loves a good fibonacci series. This isn't quite one of
+ them because we need larger values in fewer steps, in a way that
+ won't be optimized away. We're looking to approximately double the
+ total time each test iteration takes, so as to not swamp the useful
+ timings. */
+
+#pragma GCC push_options
+#pragma GCC optimize(1)
+
+static int __attribute__((noinline))
+fibonacci (int i)
+{
+ asm("");
+ if (i > 2)
+ return fibonacci (i-1) + fibonacci (i-2);
+ return 10+i;
+}
+
+static void
+do_filler (void)
+{
+ static char buf1[512], buf2[512];
+ int f = fibonacci (5);
+ memcpy (buf1, buf2, f);
+}
+
+#pragma GCC pop_options
+
+static timing_t
+test_mutex (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_mutex_init (&m, NULL);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_mutex_lock (&m);
+ FILLER_GOES_HERE;
+ pthread_mutex_unlock (&m);
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static timing_t
+test_mutex_trylock (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_mutex_init (&m, NULL);
+ pthread_mutex_lock (&m);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_mutex_trylock (&m);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ pthread_mutex_unlock (&m);
+ return cur;
+}
+
+static timing_t
+test_rwlock_read (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_rwlock_init (&rw, NULL);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_rwlock_rdlock (&rw);
+ FILLER_GOES_HERE;
+ pthread_rwlock_unlock (&rw);
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static timing_t
+test_rwlock_tryread (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_rwlock_init (&rw, NULL);
+ pthread_rwlock_wrlock (&rw);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_rwlock_tryrdlock (&rw);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ pthread_rwlock_unlock (&rw);
+ return cur;
+}
+
+static timing_t
+test_rwlock_write (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_rwlock_init (&rw, NULL);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_rwlock_wrlock (&rw);
+ FILLER_GOES_HERE;
+ pthread_rwlock_unlock (&rw);
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static timing_t
+test_rwlock_trywrite (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_rwlock_init (&rw, NULL);
+ pthread_rwlock_rdlock (&rw);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_rwlock_trywrlock (&rw);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ pthread_rwlock_unlock (&rw);
+ return cur;
+}
+
+static timing_t
+test_spin_lock (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_spin_lock (&sp);
+ FILLER_GOES_HERE;
+ pthread_spin_unlock (&sp);
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static timing_t
+test_spin_trylock (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
+ pthread_spin_lock (&sp);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_spin_trylock (&sp);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ pthread_spin_unlock (&sp);
+ return cur;
+}
+
+static timing_t
+test_sem_wait (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ sem_init (&sem, 0, 1);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ sem_post (&sem);
+ FILLER_GOES_HERE;
+ sem_wait (&sem);
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static timing_t
+test_sem_trywait (long iters, int filler)
+{
+ timing_t start, stop, cur;
+
+ sem_init (&sem, 0, 0);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ sem_trywait (&sem);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ return cur;
+}
+
+static void *
+test_condvar_helper (void *v)
+{
+ /* This is wasteful, but the alternative is to add the overhead of a
+ mutex lock/unlock to the overall iteration (both threads) and we
+ don't want that. Ideally, this thread would run on an
+ independent processing core anyway. The ONLY goal here is to
+ minimize the time the other thread spends waiting for us. */
+ while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
+ pthread_cond_signal (&cv);
+
+ return NULL;
+}
+
+static timing_t
+test_condvar (long iters, int filler)
+{
+ timing_t start, stop, cur;
+ pthread_t helper_id;
+
+ pthread_mutex_init (&m, NULL);
+ pthread_cond_init (&cv, NULL);
+ pthread_mutex_lock (&m);
+
+ __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
+ pthread_create (&helper_id, NULL, test_condvar_helper, &iters);
+
+ TIMING_NOW (start);
+ for (long j = iters; j >= 0; --j)
+ {
+ pthread_cond_wait (&cv, &m);
+ FILLER_GOES_HERE;
+ }
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+ pthread_mutex_unlock (&m);
+ __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);
+
+ pthread_join (helper_id, NULL);
+ return cur;
+}
+
+/* How many items are "queued" in our pretend queue. */
+static int queued = 0;
+
+typedef struct Producer_Params {
+ long iters;
+ int filler;
+} Producer_Params;
+
+/* We only benchmark the consumer thread, but both threads are doing
+ essentially the same thing, and never run in parallel due to the
+ locks. Thus, even if they run on separate processing cores, we
+ count the time for both threads. */
+static void *
+test_producer_thread (void *v)
+{
+ Producer_Params *p = (Producer_Params *) v;
+ long iters = p->iters;
+ int filler = p->filler;
+ long j;
+
+ for (j = iters; j >= 0; --j)
+ {
+ /* Aquire lock on the queue. */
+ pthread_mutex_lock (&m);
+ /* if something's already there, wait. */
+ while (queued > 0)
+ pthread_cond_wait (&consumer_c, &m);
+
+ /* Put something on the queue */
+ FILLER_GOES_HERE;
+ ++ queued;
+ pthread_cond_signal (&producer_c);
+
+ /* Give the other thread a chance to run. */
+ pthread_mutex_unlock (&m);
+ }
+
+ return NULL;
+}
+
+static timing_t
+test_consumer_producer (long iters, int filler)
+{
+ timing_t start, stop, cur;
+ pthread_t helper_id;
+ Producer_Params p;
+
+ p.iters = iters;
+ p.filler = filler;
+
+ pthread_mutex_init (&m, NULL);
+ pthread_cond_init (&cv, NULL);
+
+ pthread_create (&helper_id, NULL, test_producer_thread, &p);
+
+ TIMING_NOW (start);
+
+ for (long j = iters; j >= 0; --j)
+ {
+ /* Aquire lock on the queue. */
+ pthread_mutex_lock (&m);
+ /* Wait for something to be on the queue. */
+ while (queued == 0)
+ pthread_cond_wait (&producer_c, &m);
+
+ /* Take if off. */
+ FILLER_GOES_HERE;
+ -- queued;
+ pthread_cond_signal (&consumer_c);
+
+ /* Give the other thread a chance to run. */
+ pthread_mutex_unlock (&m);
+ }
+
+ TIMING_NOW (stop);
+ TIMING_DIFF (cur, start, stop);
+
+
+ pthread_join (helper_id, NULL);
+ return cur;
+}
+
+/* Number of runs we use for computing mean and standard deviation.
+ We actually do two additional runs and discard the outliers. */
+#define RUN_COUNT 10
+
+static int
+do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
+{
+ timing_t cur;
+ struct timeval ts, te;
+ double tsd, ted, td;
+ long iters, iters_limit;
+ timing_t curs[RUN_COUNT + 2];
+ int i, j;
+ double mean, stdev;
+
+ iters = START_ITERS;
+ iters_limit = LONG_MAX / 100;
+
+ while (1) {
+ gettimeofday (&ts, NULL);
+ cur = func(iters, filler);
+ gettimeofday (&te, NULL);
+
+ /* We want a test to take at least 0.01 seconds, and try
+ increasingly larger iteration counts until it does. This
+ allows for approximately constant-time tests regardless of
+ hardware speed, without the overhead of checking the time
+ inside the test loop itself. We stop at a million iterations
+ as that should be precise enough. Once we determine a suitable
+ iteration count, we run the test multiple times to calculate
+ mean and standard deviation. */
+
+ /* Note that this also primes the CPU cache and triggers faster
+ MHz, we hope. */
+ tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
+ ted = te.tv_sec + te.tv_usec / 1000000.0;
+ td = ted - tsd;
+ if (td >= 0.01
+ || iters >= iters_limit
+ || iters >= 1000000)
+ break;
+
+ iters *= 10;
+ }
+
+ curs[0] = cur;
+ for (i = 1; i < RUN_COUNT + 2; i ++)
+ curs[i] = func(iters, filler);
+
+ /* We sort the results so we can discard the fastest and slowest
+ times as outliers. In theory we should keep the fastest time,
+ but IMHO this is more fair. A simple bubble sort suffices. */
+
+ for (i = 0; i < RUN_COUNT + 1; i ++)
+ for (j = i + 1; j < RUN_COUNT + 2; j ++)
+ if (curs[i] > curs[j])
+ {
+ timing_t temp = curs[i];
+ curs[i] = curs[j];
+ curs[j] = temp;
+ }
+
+ /* Now calculate mean and standard deviation, skipping the outliers. */
+ mean = 0.0;
+ for (i = 1; i<RUN_COUNT + 1; i ++)
+ mean += (double) curs[i] / (double) iters;
+ mean /= RUN_COUNT;
+
+ stdev = 0.0;
+ for (i = 1; i < RUN_COUNT + 1; i ++)
+ {
+ double s = (double) curs[i] / (double) iters - mean;
+ stdev += s * s;
+ }
+ stdev = sqrt (stdev / (RUN_COUNT - 1));
+
+ json_attr_object_begin (js, filler ? "filler" : "empty");
+
+ json_attr_double (js, "duration", (double) cur);
+ json_attr_double (js, "iterations", (double) iters);
+ json_attr_double (js, "wall_sec", (double) td);
+ json_attr_double (js, "mean", mean);
+ json_attr_double (js, "stdev", stdev);
+ json_attr_double (js, "min_outlier", (double) curs[0] / (double) iters);
+ json_attr_double (js, "min", (double) curs[1] / (double) iters);
+ json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
+ json_attr_double (js, "max_outlier", (double) curs[RUN_COUNT + 1] / (double) iters);
+
+ json_attr_object_end (js);
+
+ return 0;
+}
+
+static int
+do_bench_1 (const char *name, test_t func, json_ctx_t *js)
+{
+ int rv = 0;
+
+ json_attr_object_begin (js, name);
+
+ rv += do_bench_2 (name, func, 0, js);
+ rv += do_bench_2 (name, func, 1, js);
+
+ json_attr_object_end (js);
+
+ return rv;
+}
+
+int
+do_bench (void)
+{
+ int rv = 0;
+ json_ctx_t json_ctx;
+
+ json_init (&json_ctx, 2, stdout);
+ json_attr_object_begin (&json_ctx, "pthread_locks");
+
+#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)
+
+ BENCH (mutex);
+ BENCH (mutex_trylock);
+ BENCH (rwlock_read);
+ BENCH (rwlock_tryread);
+ BENCH (rwlock_write);
+ BENCH (rwlock_trywrite);
+ BENCH (spin_lock);
+ BENCH (spin_trylock);
+ BENCH (sem_wait);
+ BENCH (sem_trywait);
+ BENCH (condvar);
+ BENCH (consumer_producer);
+
+ json_attr_object_end (&json_ctx);
+
+ return rv;
+}
+
+
+#define TEST_FUNCTION do_bench ()
+
+#include "../test-skeleton.c"