[v2] benchtests: Add pthread-mutex-locks bench
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
dj/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Benchmark for testing pthread mutex locks performance with different
threads and critical sections.
The test configuration consists of 3 parts:
1. thread number
2. critical-section length
3. non-critical-section length
The thread count starts at 1 and doubles until it reaches the number of
CPU cores (nprocs). An additional over-saturation case (1.25 * nprocs) is
also included.
The critical section is represented by a loop of shared do_filler() calls;
its length is determined by the number of loop iterations.
The non-critical section is similar to the critical section, except that
it is based on the non-shared do_filler().
Currently, adaptive pthread_mutex lock is tested.
v2: Fix benchout json schema validation error.
---
benchtests/Makefile | 2 +
benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++
2 files changed, 290 insertions(+)
create mode 100644 benchtests/bench-pthread-mutex-locks.c
Comments
On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> Benchmark for testing pthread mutex locks performance with different
> threads and critical sections.
>
> The test configuration consists of 3 parts:
> 1. thread number
> 2. critical-section length
> 3. non-critical-section length
>
> Thread number starts from 1 and increased by 2x until num of CPU cores
> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
> included.
> Critical-section is represented by a loop of shared do_filler(),
> length can be determined by the loop iters.
> Non-critical-section is similar to the critical-section, except it's
> based on non-shared do_filler().
>
> Currently, adaptive pthread_mutex lock is tested.
>
> v2: Fix benchout json schema validation error.
> ---
> benchtests/Makefile | 2 +
> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++
> 2 files changed, 290 insertions(+)
> create mode 100644 benchtests/bench-pthread-mutex-locks.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 8dfca592fd..b477042e6c 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -102,6 +102,7 @@ endif
>
> bench-pthread := \
> pthread-locks \
> + pthread-mutex-locks \
> pthread_once \
> thread_create \
> # bench-pthread
> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
>
>
>
> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
> new file mode 100644
> index 0000000000..e934b0001a
> --- /dev/null
> +++ b/benchtests/bench-pthread-mutex-locks.c
> @@ -0,0 +1,288 @@
> +/* Measure mutex_lock for different threads and critical sections.
> + Copyright (C) 2020-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "pthread-mutex-locks"
> +#define TIMEOUT (20 * 60)
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <math.h>
> +#include <pthread.h>
> +#include <sys/time.h>
> +#include <sys/sysinfo.h>
> +#include "bench-timing.h"
> +#include "json-lib.h"
> +
> +static pthread_mutex_t lock;
> +static pthread_mutexattr_t attr;
> +static pthread_barrier_t barrier;
> +
> +#define START_ITERS 1000
> +
> +#pragma GCC push_options
> +#pragma GCC optimize(1)
> +
> +static int __attribute__ ((noinline)) fibonacci (int i)
> +{
> + asm("");
> + if (i > 2)
> + return fibonacci (i - 1) + fibonacci (i - 2);
> + return 10 + i;
> +}
> +
> +static void
> +do_filler (void)
> +{
> + char buf1[512], buf2[512];
> + int f = fibonacci (4);
> + memcpy (buf1, buf2, f);
> +}
> +
> +static void
> +do_filler_shared (void)
> +{
> + static char buf1[512], buf2[512];
> + int f = fibonacci (4);
> + memcpy (buf1, buf2, f);
> +}
> +
> +#pragma GCC pop_options
> +
> +#define UNIT_WORK_CRT do_filler_shared ()
> +#define UNIT_WORK_NON_CRT do_filler ()
> +
> +static inline void
> +critical_section (int length)
> +{
> + for (int i = length; i >= 0; i--)
> + UNIT_WORK_CRT;
> +}
> +
> +static inline void
> +non_critical_section (int length)
> +{
> + for (int i = length; i >= 0; i--)
> + UNIT_WORK_NON_CRT;
> +}
> +
> +typedef struct Worker_Params
> +{
> + long iters;
> + int crt_len;
> + int non_crt_len;
> + timing_t duration;
> +} Worker_Params;
> +
> +static void *
> +worker (void *v)
> +{
> + timing_t start, stop;
> + Worker_Params *p = (Worker_Params *) v;
> + long iters = p->iters;
> + int crt_len = p->crt_len;
> + int non_crt_len = p->non_crt_len;
> +
> + pthread_barrier_wait (&barrier);
> + TIMING_NOW (start);
> + while (iters--)
> + {
> + pthread_mutex_lock (&lock);
> + critical_section (crt_len);
> + pthread_mutex_unlock (&lock);
> + non_critical_section (non_crt_len);
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (p->duration, start, stop);
> + return NULL;
> +}
> +
> +static double
> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> +{
> + int i;
> + timing_t mean;
> + Worker_Params *p, params[num_threads];
> + pthread_t threads[num_threads];
> +
> + pthread_mutex_init (&lock, &attr);
> + pthread_barrier_init (&barrier, NULL, num_threads);
> +
> + for (i = 0; i < num_threads; i++)
> + {
> + p = &params[i];
> + p->iters = iters;
> + p->crt_len = crt_len;
> + p->non_crt_len = non_crt_len;
> + pthread_create (&threads[i], NULL, worker, (void *) p);
> + }
> + for (i = 0; i < num_threads; i++)
> + pthread_join (threads[i], NULL);
> +
> + pthread_mutex_destroy (&lock);
> + pthread_barrier_destroy (&barrier);
> +
> + mean = 0;
> + for (i = 0; i < num_threads; i++)
> + mean += params[i].duration;
> + mean /= num_threads;
> + return mean;
> +}
> +
> +#define RUN_COUNT 10
> +#define MIN_TEST_SEC 0.01
> +
> +static void
> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len,
> + json_ctx_t *js)
> +{
> + timing_t cur;
> + struct timeval ts, te;
> + double tsd, ted, td;
> + long iters, iters_limit, total_iters;
> + timing_t curs[RUN_COUNT + 2];
> + int i, j;
> + double mean, stdev;
> +
> + iters = START_ITERS;
> + iters_limit = LONG_MAX / 100;
> +
> + while (1)
> + {
> + gettimeofday (&ts, NULL);
> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
> + gettimeofday (&te, NULL);
> + /* Make sure the test runs for at least MIN_TEST_SEC. */
> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
> + ted = te.tv_sec + te.tv_usec / 1000000.0;
> + td = ted - tsd;
> + if (td >= MIN_TEST_SEC || iters >= iters_limit)
> + break;
> +
> + iters *= 10;
> + }
> +
> + curs[0] = cur;
> + for (i = 1; i < RUN_COUNT + 2; i++)
> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
> +
> + /* Sort the results so we can discard the fastest and slowest
> + times as outliers. */
> + for (i = 0; i < RUN_COUNT + 1; i++)
> + for (j = i + 1; j < RUN_COUNT + 2; j++)
> + if (curs[i] > curs[j])
> + {
> + timing_t temp = curs[i];
> + curs[i] = curs[j];
> + curs[j] = temp;
> + }
> +
> + /* Calculate mean and standard deviation. */
> + mean = 0.0;
> + total_iters = iters * num_threads;
> + for (i = 1; i < RUN_COUNT + 1; i++)
> + mean += (double) curs[i] / (double) total_iters;
> + mean /= RUN_COUNT;
> +
> + stdev = 0.0;
> + for (i = 1; i < RUN_COUNT + 1; i++)
> + {
> + double s = (double) curs[i] / (double) total_iters - mean;
> + stdev += s * s;
> + }
> + stdev = sqrt (stdev / (RUN_COUNT - 1));
> +
> + char buf[256];
> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name,
> + non_crt_len, crt_len, num_threads);
> +
> + json_attr_object_begin (js, buf);
> +
> + json_attr_double (js, "duration", (double) cur);
> + json_attr_double (js, "iterations", (double) total_iters);
> + json_attr_double (js, "mean", mean);
> + json_attr_double (js, "stdev", stdev);
> + json_attr_double (js, "min-outlier",
> + (double) curs[0] / (double) total_iters);
> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
> + json_attr_double (js, "max",
> + (double) curs[RUN_COUNT] / (double) total_iters);
> + json_attr_double (js, "max-outlier",
> + (double) curs[RUN_COUNT + 1] / (double) total_iters);
> +
> + json_attr_object_end (js);
> +}
> +
> +#define TH_CONF_MAX 10
> +
> +int
> +do_bench (void)
> +{
> + int rv = 0;
> + json_ctx_t json_ctx;
> + int i, j, k;
> + int th_num, th_conf, nprocs;
> + int threads[TH_CONF_MAX];
> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
> + int non_crt_lens[] = { 1, 32, 128 };
> + char name[128];
> +
> + json_init (&json_ctx, 2, stdout);
> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
> +
> + /* The thread config begins from 1, and increases by 2x until nprocs.
> + We also want to test the over-saturation case (1.25*nprocs). */
> + nprocs = get_nprocs ();
> + th_num = 1;
> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
> + {
> + threads[th_conf] = th_num;
> + th_num <<= 1;
> + }
> + threads[th_conf++] = nprocs;
> + threads[th_conf++] = nprocs + nprocs / 4;
> +
> + pthread_mutexattr_init (&attr);
> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> + snprintf (name, sizeof name, "type=adaptive");
> +
> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
> + {
> + int non_crt_len = non_crt_lens[k];
> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
> + {
> + int crt_len = crt_lens[j];
> + for (i = 0; i < th_conf; i++)
> + {
> + th_num = threads[i];
> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx);
> + }
> + }
> + }
> +
> + json_attr_object_end (&json_ctx);
> +
> + return rv;
> +}
> +
> +#define TEST_FUNCTION do_bench ()
> +
> +#include "../test-skeleton.c"
> --
> 2.35.1
>
Can you run clang-format on this? Otherwise
LGTM.
On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote:
> On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
>>
>> Benchmark for testing pthread mutex locks performance with different
>> threads and critical sections.
>>
>> The test configuration consists of 3 parts:
>> 1. thread number
>> 2. critical-section length
>> 3. non-critical-section length
>>
>> Thread number starts from 1 and increased by 2x until num of CPU cores
>> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
>> included.
>> Critical-section is represented by a loop of shared do_filler(),
>> length can be determined by the loop iters.
>> Non-critical-section is similar to the critical-section, except it's
>> based on non-shared do_filler().
>>
>> Currently, adaptive pthread_mutex lock is tested.
>>
>> v2: Fix benchout json schema validation error.
>> ---
>> benchtests/Makefile | 2 +
>> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++
>> 2 files changed, 290 insertions(+)
>> create mode 100644 benchtests/bench-pthread-mutex-locks.c
>>
>> diff --git a/benchtests/Makefile b/benchtests/Makefile
>> index 8dfca592fd..b477042e6c 100644
>> --- a/benchtests/Makefile
>> +++ b/benchtests/Makefile
>> @@ -102,6 +102,7 @@ endif
>>
>> bench-pthread := \
>> pthread-locks \
>> + pthread-mutex-locks \
>> pthread_once \
>> thread_create \
>> # bench-pthread
>> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
>> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
>> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
>> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
>> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
>>
>>
>>
>> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
>> new file mode 100644
>> index 0000000000..e934b0001a
>> --- /dev/null
>> +++ b/benchtests/bench-pthread-mutex-locks.c
>> @@ -0,0 +1,288 @@
>> +/* Measure mutex_lock for different threads and critical sections.
>> + Copyright (C) 2020-2022 Free Software Foundation, Inc.
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#define TEST_MAIN
>> +#define TEST_NAME "pthread-mutex-locks"
>> +#define TIMEOUT (20 * 60)
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include <math.h>
>> +#include <pthread.h>
>> +#include <sys/time.h>
>> +#include <sys/sysinfo.h>
>> +#include "bench-timing.h"
>> +#include "json-lib.h"
>> +
>> +static pthread_mutex_t lock;
>> +static pthread_mutexattr_t attr;
>> +static pthread_barrier_t barrier;
>> +
>> +#define START_ITERS 1000
>> +
>> +#pragma GCC push_options
>> +#pragma GCC optimize(1)
>> +
>> +static int __attribute__ ((noinline)) fibonacci (int i)
>> +{
>> + asm("");
>> + if (i > 2)
>> + return fibonacci (i - 1) + fibonacci (i - 2);
>> + return 10 + i;
>> +}
>> +
>> +static void
>> +do_filler (void)
>> +{
>> + char buf1[512], buf2[512];
>> + int f = fibonacci (4);
>> + memcpy (buf1, buf2, f);
>> +}
>> +
>> +static void
>> +do_filler_shared (void)
>> +{
>> + static char buf1[512], buf2[512];
>> + int f = fibonacci (4);
>> + memcpy (buf1, buf2, f);
>> +}
>> +
>> +#pragma GCC pop_options
>> +
>> +#define UNIT_WORK_CRT do_filler_shared ()
>> +#define UNIT_WORK_NON_CRT do_filler ()
>> +
>> +static inline void
>> +critical_section (int length)
>> +{
>> + for (int i = length; i >= 0; i--)
>> + UNIT_WORK_CRT;
>> +}
>> +
>> +static inline void
>> +non_critical_section (int length)
>> +{
>> + for (int i = length; i >= 0; i--)
>> + UNIT_WORK_NON_CRT;
>> +}
>> +
>> +typedef struct Worker_Params
>> +{
>> + long iters;
>> + int crt_len;
>> + int non_crt_len;
>> + timing_t duration;
>> +} Worker_Params;
>> +
>> +static void *
>> +worker (void *v)
>> +{
>> + timing_t start, stop;
>> + Worker_Params *p = (Worker_Params *) v;
>> + long iters = p->iters;
>> + int crt_len = p->crt_len;
>> + int non_crt_len = p->non_crt_len;
>> +
>> + pthread_barrier_wait (&barrier);
>> + TIMING_NOW (start);
>> + while (iters--)
>> + {
>> + pthread_mutex_lock (&lock);
>> + critical_section (crt_len);
>> + pthread_mutex_unlock (&lock);
>> + non_critical_section (non_crt_len);
>> + }
>> + TIMING_NOW (stop);
>> +
>> + TIMING_DIFF (p->duration, start, stop);
>> + return NULL;
>> +}
>> +
>> +static double
>> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
>> +{
>> + int i;
>> + timing_t mean;
>> + Worker_Params *p, params[num_threads];
>> + pthread_t threads[num_threads];
>> +
>> + pthread_mutex_init (&lock, &attr);
>> + pthread_barrier_init (&barrier, NULL, num_threads);
>> +
>> + for (i = 0; i < num_threads; i++)
>> + {
>> + p = &params[i];
>> + p->iters = iters;
>> + p->crt_len = crt_len;
>> + p->non_crt_len = non_crt_len;
>> + pthread_create (&threads[i], NULL, worker, (void *) p);
>> + }
>> + for (i = 0; i < num_threads; i++)
>> + pthread_join (threads[i], NULL);
>> +
>> + pthread_mutex_destroy (&lock);
>> + pthread_barrier_destroy (&barrier);
>> +
>> + mean = 0;
>> + for (i = 0; i < num_threads; i++)
>> + mean += params[i].duration;
>> + mean /= num_threads;
>> + return mean;
>> +}
>> +
>> +#define RUN_COUNT 10
>> +#define MIN_TEST_SEC 0.01
>> +
>> +static void
>> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len,
>> + json_ctx_t *js)
>> +{
>> + timing_t cur;
>> + struct timeval ts, te;
>> + double tsd, ted, td;
>> + long iters, iters_limit, total_iters;
>> + timing_t curs[RUN_COUNT + 2];
>> + int i, j;
>> + double mean, stdev;
>> +
>> + iters = START_ITERS;
>> + iters_limit = LONG_MAX / 100;
>> +
>> + while (1)
>> + {
>> + gettimeofday (&ts, NULL);
>> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
>> + gettimeofday (&te, NULL);
>> + /* Make sure the test to run at least MIN_TEST_SEC. */
>> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
>> + ted = te.tv_sec + te.tv_usec / 1000000.0;
>> + td = ted - tsd;
>> + if (td >= MIN_TEST_SEC || iters >= iters_limit)
>> + break;
>> +
>> + iters *= 10;
>> + }
>> +
>> + curs[0] = cur;
>> + for (i = 1; i < RUN_COUNT + 2; i++)
>> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
>> +
>> + /* Sort the results so we can discard the fastest and slowest
>> + times as outliers. */
>> + for (i = 0; i < RUN_COUNT + 1; i++)
>> + for (j = i + 1; j < RUN_COUNT + 2; j++)
>> + if (curs[i] > curs[j])
>> + {
>> + timing_t temp = curs[i];
>> + curs[i] = curs[j];
>> + curs[j] = temp;
>> + }
>> +
>> + /* Calculate mean and standard deviation. */
>> + mean = 0.0;
>> + total_iters = iters * num_threads;
>> + for (i = 1; i < RUN_COUNT + 1; i++)
>> + mean += (double) curs[i] / (double) total_iters;
>> + mean /= RUN_COUNT;
>> +
>> + stdev = 0.0;
>> + for (i = 1; i < RUN_COUNT + 1; i++)
>> + {
>> + double s = (double) curs[i] / (double) total_iters - mean;
>> + stdev += s * s;
>> + }
>> + stdev = sqrt (stdev / (RUN_COUNT - 1));
>> +
>> + char buf[256];
>> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name,
>> + non_crt_len, crt_len, num_threads);
>> +
>> + json_attr_object_begin (js, buf);
>> +
>> + json_attr_double (js, "duration", (double) cur);
>> + json_attr_double (js, "iterations", (double) total_iters);
>> + json_attr_double (js, "mean", mean);
>> + json_attr_double (js, "stdev", stdev);
>> + json_attr_double (js, "min-outlier",
>> + (double) curs[0] / (double) total_iters);
>> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
>> + json_attr_double (js, "max",
>> + (double) curs[RUN_COUNT] / (double) total_iters);
>> + json_attr_double (js, "max-outlier",
>> + (double) curs[RUN_COUNT + 1] / (double) total_iters);
>> +
>> + json_attr_object_end (js);
>> +}
>> +
>> +#define TH_CONF_MAX 10
>> +
>> +int
>> +do_bench (void)
>> +{
>> + int rv = 0;
>> + json_ctx_t json_ctx;
>> + int i, j, k;
>> + int th_num, th_conf, nprocs;
>> + int threads[TH_CONF_MAX];
>> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
>> + int non_crt_lens[] = { 1, 32, 128 };
>> + char name[128];
>> +
>> + json_init (&json_ctx, 2, stdout);
>> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
>> +
>> + /* The thread config begins from 1, and increases by 2x until nprocs.
>> + We also wants to test over-saturation case (1.25*nprocs). */
>> + nprocs = get_nprocs ();
>> + th_num = 1;
>> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
>> + {
>> + threads[th_conf] = th_num;
>> + th_num <<= 1;
>> + }
>> + threads[th_conf++] = nprocs;
>> + threads[th_conf++] = nprocs + nprocs / 4;
>> +
>> + pthread_mutexattr_init (&attr);
>> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
>> + snprintf (name, sizeof name, "type=adaptive");
>> +
>> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
>> + {
>> + int non_crt_len = non_crt_lens[k];
>> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
>> + {
>> + int crt_len = crt_lens[j];
>> + for (i = 0; i < th_conf; i++)
>> + {
>> + th_num = threads[i];
>> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx);
>> + }
>> + }
>> + }
>> +
>> + json_attr_object_end (&json_ctx);
>> +
>> + return rv;
>> +}
>> +
>> +#define TEST_FUNCTION do_bench ()
>> +
>> +#include "../test-skeleton.c"
>> --
>> 2.35.1
>>
>
> Can you run clang-format on this? Otherwise
> LGTM.
>
clang-format done.
Nothing needs to change for this patch.
On Thu, Apr 21, 2022 at 5:58 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>
> On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote:
> > On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
> >>
> >> Benchmark for testing pthread mutex locks performance with different
> >> threads and critical sections.
> >>
> >> The test configuration consists of 3 parts:
> >> 1. thread number
> >> 2. critical-section length
> >> 3. non-critical-section length
> >>
> >> Thread number starts from 1 and increased by 2x until num of CPU cores
> >> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
> >> included.
> >> Critical-section is represented by a loop of shared do_filler(),
> >> length can be determined by the loop iters.
> >> Non-critical-section is similar to the critical-section, except it's
> >> based on non-shared do_filler().
> >>
> >> Currently, adaptive pthread_mutex lock is tested.
> >>
> >> v2: Fix benchout json schema validation error.
> >> ---
> >> benchtests/Makefile | 2 +
> >> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++
> >> 2 files changed, 290 insertions(+)
> >> create mode 100644 benchtests/bench-pthread-mutex-locks.c
> >>
> >> diff --git a/benchtests/Makefile b/benchtests/Makefile
> >> index 8dfca592fd..b477042e6c 100644
> >> --- a/benchtests/Makefile
> >> +++ b/benchtests/Makefile
> >> @@ -102,6 +102,7 @@ endif
> >>
> >> bench-pthread := \
> >> pthread-locks \
> >> + pthread-mutex-locks \
> >> pthread_once \
> >> thread_create \
> >> # bench-pthread
> >> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
> >> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
> >> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
> >> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
> >> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
> >>
> >>
> >>
> >> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
> >> new file mode 100644
> >> index 0000000000..e934b0001a
> >> --- /dev/null
> >> +++ b/benchtests/bench-pthread-mutex-locks.c
> >> @@ -0,0 +1,288 @@
> >> +/* Measure mutex_lock for different threads and critical sections.
> >> + Copyright (C) 2020-2022 Free Software Foundation, Inc.
> >> + This file is part of the GNU C Library.
> >> +
> >> + The GNU C Library is free software; you can redistribute it and/or
> >> + modify it under the terms of the GNU Lesser General Public
> >> + License as published by the Free Software Foundation; either
> >> + version 2.1 of the License, or (at your option) any later version.
> >> +
> >> + The GNU C Library is distributed in the hope that it will be useful,
> >> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> >> + Lesser General Public License for more details.
> >> +
> >> + You should have received a copy of the GNU Lesser General Public
> >> + License along with the GNU C Library; if not, see
> >> + <https://www.gnu.org/licenses/>. */
> >> +
> >> +#define TEST_MAIN
> >> +#define TEST_NAME "pthread-mutex-locks"
> >> +#define TIMEOUT (20 * 60)
> >> +
> >> +#include <stdio.h>
> >> +#include <stdlib.h>
> >> +#include <string.h>
> >> +#include <unistd.h>
> >> +#include <math.h>
> >> +#include <pthread.h>
> >> +#include <sys/time.h>
> >> +#include <sys/sysinfo.h>
> >> +#include "bench-timing.h"
> >> +#include "json-lib.h"
> >> +
> >> +static pthread_mutex_t lock;
> >> +static pthread_mutexattr_t attr;
> >> +static pthread_barrier_t barrier;
> >> +
> >> +#define START_ITERS 1000
> >> +
> >> +#pragma GCC push_options
> >> +#pragma GCC optimize(1)
> >> +
> >> +static int __attribute__ ((noinline)) fibonacci (int i)
> >> +{
> >> + asm("");
> >> + if (i > 2)
> >> + return fibonacci (i - 1) + fibonacci (i - 2);
> >> + return 10 + i;
> >> +}
> >> +
> >> +static void
> >> +do_filler (void)
> >> +{
> >> + char buf1[512], buf2[512];
> >> + int f = fibonacci (4);
> >> + memcpy (buf1, buf2, f);
> >> +}
> >> +
> >> +static void
> >> +do_filler_shared (void)
> >> +{
> >> + static char buf1[512], buf2[512];
> >> + int f = fibonacci (4);
> >> + memcpy (buf1, buf2, f);
> >> +}
> >> +
> >> +#pragma GCC pop_options
> >> +
> >> +#define UNIT_WORK_CRT do_filler_shared ()
> >> +#define UNIT_WORK_NON_CRT do_filler ()
> >> +
> >> +static inline void
> >> +critical_section (int length)
> >> +{
> >> + for (int i = length; i >= 0; i--)
> >> + UNIT_WORK_CRT;
> >> +}
> >> +
> >> +static inline void
> >> +non_critical_section (int length)
> >> +{
> >> + for (int i = length; i >= 0; i--)
> >> + UNIT_WORK_NON_CRT;
> >> +}
> >> +
> >> +typedef struct Worker_Params
> >> +{
> >> + long iters;
> >> + int crt_len;
> >> + int non_crt_len;
> >> + timing_t duration;
> >> +} Worker_Params;
> >> +
> >> +static void *
> >> +worker (void *v)
> >> +{
> >> + timing_t start, stop;
> >> + Worker_Params *p = (Worker_Params *) v;
> >> + long iters = p->iters;
> >> + int crt_len = p->crt_len;
> >> + int non_crt_len = p->non_crt_len;
> >> +
> >> + pthread_barrier_wait (&barrier);
> >> + TIMING_NOW (start);
> >> + while (iters--)
> >> + {
> >> + pthread_mutex_lock (&lock);
> >> + critical_section (crt_len);
> >> + pthread_mutex_unlock (&lock);
> >> + non_critical_section (non_crt_len);
> >> + }
> >> + TIMING_NOW (stop);
> >> +
> >> + TIMING_DIFF (p->duration, start, stop);
> >> + return NULL;
> >> +}
> >> +
> >> +static double
> >> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> >> +{
> >> + int i;
> >> + timing_t mean;
> >> + Worker_Params *p, params[num_threads];
> >> + pthread_t threads[num_threads];
> >> +
> >> + pthread_mutex_init (&lock, &attr);
> >> + pthread_barrier_init (&barrier, NULL, num_threads);
> >> +
> >> + for (i = 0; i < num_threads; i++)
> >> + {
> >> + p = &params[i];
> >> + p->iters = iters;
> >> + p->crt_len = crt_len;
> >> + p->non_crt_len = non_crt_len;
> >> + pthread_create (&threads[i], NULL, worker, (void *) p);
> >> + }
> >> + for (i = 0; i < num_threads; i++)
> >> + pthread_join (threads[i], NULL);
> >> +
> >> + pthread_mutex_destroy (&lock);
> >> + pthread_barrier_destroy (&barrier);
> >> +
> >> + mean = 0;
> >> + for (i = 0; i < num_threads; i++)
> >> + mean += params[i].duration;
> >> + mean /= num_threads;
> >> + return mean;
> >> +}
> >> +
> >> +#define RUN_COUNT 10
> >> +#define MIN_TEST_SEC 0.01
> >> +
> >> +static void
> >> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len,
> >> + json_ctx_t *js)
> >> +{
> >> + timing_t cur;
> >> + struct timeval ts, te;
> >> + double tsd, ted, td;
> >> + long iters, iters_limit, total_iters;
> >> + timing_t curs[RUN_COUNT + 2];
> >> + int i, j;
> >> + double mean, stdev;
> >> +
> >> + iters = START_ITERS;
> >> + iters_limit = LONG_MAX / 100;
> >> +
> >> + while (1)
> >> + {
> >> + gettimeofday (&ts, NULL);
> >> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
> >> + gettimeofday (&te, NULL);
> >> + /* Make sure the test to run at least MIN_TEST_SEC. */
> >> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
> >> + ted = te.tv_sec + te.tv_usec / 1000000.0;
> >> + td = ted - tsd;
> >> + if (td >= MIN_TEST_SEC || iters >= iters_limit)
> >> + break;
> >> +
> >> + iters *= 10;
> >> + }
> >> +
> >> + curs[0] = cur;
> >> + for (i = 1; i < RUN_COUNT + 2; i++)
> >> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
> >> +
> >> + /* Sort the results so we can discard the fastest and slowest
> >> + times as outliers. */
> >> + for (i = 0; i < RUN_COUNT + 1; i++)
> >> + for (j = i + 1; j < RUN_COUNT + 2; j++)
> >> + if (curs[i] > curs[j])
> >> + {
> >> + timing_t temp = curs[i];
> >> + curs[i] = curs[j];
> >> + curs[j] = temp;
> >> + }
> >> +
> >> + /* Calculate mean and standard deviation. */
> >> + mean = 0.0;
> >> + total_iters = iters * num_threads;
> >> + for (i = 1; i < RUN_COUNT + 1; i++)
> >> + mean += (double) curs[i] / (double) total_iters;
> >> + mean /= RUN_COUNT;
> >> +
> >> + stdev = 0.0;
> >> + for (i = 1; i < RUN_COUNT + 1; i++)
> >> + {
> >> + double s = (double) curs[i] / (double) total_iters - mean;
> >> + stdev += s * s;
> >> + }
> >> + stdev = sqrt (stdev / (RUN_COUNT - 1));
> >> +
> >> + char buf[256];
> >> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name,
> >> + non_crt_len, crt_len, num_threads);
> >> +
> >> + json_attr_object_begin (js, buf);
> >> +
> >> + json_attr_double (js, "duration", (double) cur);
> >> + json_attr_double (js, "iterations", (double) total_iters);
> >> + json_attr_double (js, "mean", mean);
> >> + json_attr_double (js, "stdev", stdev);
> >> + json_attr_double (js, "min-outlier",
> >> + (double) curs[0] / (double) total_iters);
> >> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
> >> + json_attr_double (js, "max",
> >> + (double) curs[RUN_COUNT] / (double) total_iters);
> >> + json_attr_double (js, "max-outlier",
> >> + (double) curs[RUN_COUNT + 1] / (double) total_iters);
> >> +
> >> + json_attr_object_end (js);
> >> +}
> >> +
> >> +#define TH_CONF_MAX 10
> >> +
> >> +int
> >> +do_bench (void)
> >> +{
> >> + int rv = 0;
> >> + json_ctx_t json_ctx;
> >> + int i, j, k;
> >> + int th_num, th_conf, nprocs;
> >> + int threads[TH_CONF_MAX];
> >> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
> >> + int non_crt_lens[] = { 1, 32, 128 };
> >> + char name[128];
> >> +
> >> + json_init (&json_ctx, 2, stdout);
> >> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
> >> +
> >> + /* The thread config begins from 1, and increases by 2x until nprocs.
> >> + We also wants to test over-saturation case (1.25*nprocs). */
> >> + nprocs = get_nprocs ();
> >> + th_num = 1;
> >> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
> >> + {
> >> + threads[th_conf] = th_num;
> >> + th_num <<= 1;
> >> + }
> >> + threads[th_conf++] = nprocs;
> >> + threads[th_conf++] = nprocs + nprocs / 4;
> >> +
> >> + pthread_mutexattr_init (&attr);
> >> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> >> + snprintf (name, sizeof name, "type=adaptive");
> >> +
> >> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
> >> + {
> >> + int non_crt_len = non_crt_lens[k];
> >> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
> >> + {
> >> + int crt_len = crt_lens[j];
> >> + for (i = 0; i < th_conf; i++)
> >> + {
> >> + th_num = threads[i];
> >> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx);
> >> + }
> >> + }
> >> + }
> >> +
> >> + json_attr_object_end (&json_ctx);
> >> +
> >> + return rv;
> >> +}
> >> +
> >> +#define TEST_FUNCTION do_bench ()
> >> +
> >> +#include "../test-skeleton.c"
> >> --
> >> 2.35.1
> >>
> >
> > Can you run clang-format on this? Otherwise
> > LGTM.
> >
>
> clang-format done.
> Nothing needs to change for this patch.
Woops.
LGTM.
On Sat, Apr 23, 2022 at 8:04 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Apr 21, 2022 at 5:58 PM Guo, Wangyang <wangyang.guo@intel.com> wrote:
> >
> > On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote:
> > > On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote:
> > >>
> > >> Benchmark for testing pthread mutex locks performance with different
> > >> threads and critical sections.
> > >>
> > >> The test configuration consists of 3 parts:
> > >> 1. thread number
> > >> 2. critical-section length
> > >> 3. non-critical-section length
> > >>
> > >> Thread number starts from 1 and increased by 2x until num of CPU cores
> > >> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
> > >> included.
> > >> Critical-section is represented by a loop of shared do_filler(),
> > >> length can be determined by the loop iters.
> > >> Non-critical-section is similar to the critical-section, except it's
> > >> based on non-shared do_filler().
> > >>
> > >> Currently, adaptive pthread_mutex lock is tested.
> > >>
> > >> v2: Fix benchout json schema validation error.
> > >> ---
> > >> benchtests/Makefile | 2 +
> > >> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++
> > >> 2 files changed, 290 insertions(+)
> > >> create mode 100644 benchtests/bench-pthread-mutex-locks.c
> > >>
> > >> diff --git a/benchtests/Makefile b/benchtests/Makefile
> > >> index 8dfca592fd..b477042e6c 100644
> > >> --- a/benchtests/Makefile
> > >> +++ b/benchtests/Makefile
> > >> @@ -102,6 +102,7 @@ endif
> > >>
> > >> bench-pthread := \
> > >> pthread-locks \
> > >> + pthread-mutex-locks \
> > >> pthread_once \
> > >> thread_create \
> > >> # bench-pthread
> > >> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
> > >> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
> > >> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
> > >> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
> > >> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
> > >>
> > >>
> > >>
> > >> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
> > >> new file mode 100644
> > >> index 0000000000..e934b0001a
> > >> --- /dev/null
> > >> +++ b/benchtests/bench-pthread-mutex-locks.c
> > >> @@ -0,0 +1,288 @@
> > >> +/* Measure mutex_lock for different threads and critical sections.
> > >> + Copyright (C) 2020-2022 Free Software Foundation, Inc.
> > >> + This file is part of the GNU C Library.
> > >> +
> > >> + The GNU C Library is free software; you can redistribute it and/or
> > >> + modify it under the terms of the GNU Lesser General Public
> > >> + License as published by the Free Software Foundation; either
> > >> + version 2.1 of the License, or (at your option) any later version.
> > >> +
> > >> + The GNU C Library is distributed in the hope that it will be useful,
> > >> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > >> + Lesser General Public License for more details.
> > >> +
> > >> + You should have received a copy of the GNU Lesser General Public
> > >> + License along with the GNU C Library; if not, see
> > >> + <https://www.gnu.org/licenses/>. */
> > >> +
> > >> +#define TEST_MAIN
> > >> +#define TEST_NAME "pthread-mutex-locks"
> > >> +#define TIMEOUT (20 * 60)
> > >> +
> > >> +#include <stdio.h>
> > >> +#include <stdlib.h>
> > >> +#include <string.h>
> > >> +#include <unistd.h>
> > >> +#include <math.h>
> > >> +#include <pthread.h>
> > >> +#include <sys/time.h>
> > >> +#include <sys/sysinfo.h>
> > >> +#include "bench-timing.h"
> > >> +#include "json-lib.h"
> > >> +
> > >> +static pthread_mutex_t lock;
> > >> +static pthread_mutexattr_t attr;
> > >> +static pthread_barrier_t barrier;
> > >> +
> > >> +#define START_ITERS 1000
> > >> +
> > >> +#pragma GCC push_options
> > >> +#pragma GCC optimize(1)
> > >> +
> > >> +static int __attribute__ ((noinline)) fibonacci (int i)
> > >> +{
> > >> + asm("");
> > >> + if (i > 2)
> > >> + return fibonacci (i - 1) + fibonacci (i - 2);
> > >> + return 10 + i;
> > >> +}
> > >> +
> > >> +static void
> > >> +do_filler (void)
> > >> +{
> > >> + char buf1[512], buf2[512];
> > >> + int f = fibonacci (4);
> > >> + memcpy (buf1, buf2, f);
> > >> +}
> > >> +
> > >> +static void
> > >> +do_filler_shared (void)
> > >> +{
> > >> + static char buf1[512], buf2[512];
> > >> + int f = fibonacci (4);
> > >> + memcpy (buf1, buf2, f);
> > >> +}
> > >> +
> > >> +#pragma GCC pop_options
> > >> +
> > >> +#define UNIT_WORK_CRT do_filler_shared ()
> > >> +#define UNIT_WORK_NON_CRT do_filler ()
> > >> +
> > >> +static inline void
> > >> +critical_section (int length)
> > >> +{
> > >> + for (int i = length; i >= 0; i--)
> > >> + UNIT_WORK_CRT;
> > >> +}
> > >> +
> > >> +static inline void
> > >> +non_critical_section (int length)
> > >> +{
> > >> + for (int i = length; i >= 0; i--)
> > >> + UNIT_WORK_NON_CRT;
> > >> +}
> > >> +
> > >> +typedef struct Worker_Params
> > >> +{
> > >> + long iters;
> > >> + int crt_len;
> > >> + int non_crt_len;
> > >> + timing_t duration;
> > >> +} Worker_Params;
> > >> +
> > >> +static void *
> > >> +worker (void *v)
> > >> +{
> > >> + timing_t start, stop;
> > >> + Worker_Params *p = (Worker_Params *) v;
> > >> + long iters = p->iters;
> > >> + int crt_len = p->crt_len;
> > >> + int non_crt_len = p->non_crt_len;
> > >> +
> > >> + pthread_barrier_wait (&barrier);
> > >> + TIMING_NOW (start);
> > >> + while (iters--)
> > >> + {
> > >> + pthread_mutex_lock (&lock);
> > >> + critical_section (crt_len);
> > >> + pthread_mutex_unlock (&lock);
> > >> + non_critical_section (non_crt_len);
> > >> + }
> > >> + TIMING_NOW (stop);
> > >> +
> > >> + TIMING_DIFF (p->duration, start, stop);
> > >> + return NULL;
> > >> +}
> > >> +
> > >> +static double
> > >> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> > >> +{
> > >> + int i;
> > >> + timing_t mean;
> > >> + Worker_Params *p, params[num_threads];
> > >> + pthread_t threads[num_threads];
> > >> +
> > >> + pthread_mutex_init (&lock, &attr);
> > >> + pthread_barrier_init (&barrier, NULL, num_threads);
> > >> +
> > >> + for (i = 0; i < num_threads; i++)
> > >> + {
> > >> + p = &params[i];
> > >> + p->iters = iters;
> > >> + p->crt_len = crt_len;
> > >> + p->non_crt_len = non_crt_len;
> > >> + pthread_create (&threads[i], NULL, worker, (void *) p);
> > >> + }
> > >> + for (i = 0; i < num_threads; i++)
> > >> + pthread_join (threads[i], NULL);
> > >> +
> > >> + pthread_mutex_destroy (&lock);
> > >> + pthread_barrier_destroy (&barrier);
> > >> +
> > >> + mean = 0;
> > >> + for (i = 0; i < num_threads; i++)
> > >> + mean += params[i].duration;
> > >> + mean /= num_threads;
> > >> + return mean;
> > >> +}
> > >> +
> > >> +#define RUN_COUNT 10
> > >> +#define MIN_TEST_SEC 0.01
> > >> +
> > >> +static void
> > >> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len,
> > >> + json_ctx_t *js)
> > >> +{
> > >> + timing_t cur;
> > >> + struct timeval ts, te;
> > >> + double tsd, ted, td;
> > >> + long iters, iters_limit, total_iters;
> > >> + timing_t curs[RUN_COUNT + 2];
> > >> + int i, j;
> > >> + double mean, stdev;
> > >> +
> > >> + iters = START_ITERS;
> > >> + iters_limit = LONG_MAX / 100;
> > >> +
> > >> + while (1)
> > >> + {
> > >> + gettimeofday (&ts, NULL);
> > >> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
> > >> + gettimeofday (&te, NULL);
> > >> + /* Make sure the test runs for at least MIN_TEST_SEC. */
> > >> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
> > >> + ted = te.tv_sec + te.tv_usec / 1000000.0;
> > >> + td = ted - tsd;
> > >> + if (td >= MIN_TEST_SEC || iters >= iters_limit)
> > >> + break;
> > >> +
> > >> + iters *= 10;
> > >> + }
> > >> +
> > >> + curs[0] = cur;
> > >> + for (i = 1; i < RUN_COUNT + 2; i++)
> > >> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
> > >> +
> > >> + /* Sort the results so we can discard the fastest and slowest
> > >> + times as outliers. */
> > >> + for (i = 0; i < RUN_COUNT + 1; i++)
> > >> + for (j = i + 1; j < RUN_COUNT + 2; j++)
> > >> + if (curs[i] > curs[j])
> > >> + {
> > >> + timing_t temp = curs[i];
> > >> + curs[i] = curs[j];
> > >> + curs[j] = temp;
> > >> + }
> > >> +
> > >> + /* Calculate mean and standard deviation. */
> > >> + mean = 0.0;
> > >> + total_iters = iters * num_threads;
> > >> + for (i = 1; i < RUN_COUNT + 1; i++)
> > >> + mean += (double) curs[i] / (double) total_iters;
> > >> + mean /= RUN_COUNT;
> > >> +
> > >> + stdev = 0.0;
> > >> + for (i = 1; i < RUN_COUNT + 1; i++)
> > >> + {
> > >> + double s = (double) curs[i] / (double) total_iters - mean;
> > >> + stdev += s * s;
> > >> + }
> > >> + stdev = sqrt (stdev / (RUN_COUNT - 1));
> > >> +
> > >> + char buf[256];
> > >> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name,
> > >> + non_crt_len, crt_len, num_threads);
> > >> +
> > >> + json_attr_object_begin (js, buf);
> > >> +
> > >> + json_attr_double (js, "duration", (double) cur);
> > >> + json_attr_double (js, "iterations", (double) total_iters);
> > >> + json_attr_double (js, "mean", mean);
> > >> + json_attr_double (js, "stdev", stdev);
> > >> + json_attr_double (js, "min-outlier",
> > >> + (double) curs[0] / (double) total_iters);
> > >> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
> > >> + json_attr_double (js, "max",
> > >> + (double) curs[RUN_COUNT] / (double) total_iters);
> > >> + json_attr_double (js, "max-outlier",
> > >> + (double) curs[RUN_COUNT + 1] / (double) total_iters);
> > >> +
> > >> + json_attr_object_end (js);
> > >> +}
> > >> +
> > >> +#define TH_CONF_MAX 10
> > >> +
> > >> +int
> > >> +do_bench (void)
> > >> +{
> > >> + int rv = 0;
> > >> + json_ctx_t json_ctx;
> > >> + int i, j, k;
> > >> + int th_num, th_conf, nprocs;
> > >> + int threads[TH_CONF_MAX];
> > >> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
> > >> + int non_crt_lens[] = { 1, 32, 128 };
> > >> + char name[128];
> > >> +
> > >> + json_init (&json_ctx, 2, stdout);
> > >> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
> > >> +
> > >> + /* The thread config begins from 1, and increases by 2x until nprocs.
> > >> + We also want to test the over-saturation case (1.25*nprocs). */
> > >> + nprocs = get_nprocs ();
> > >> + th_num = 1;
> > >> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
> > >> + {
> > >> + threads[th_conf] = th_num;
> > >> + th_num <<= 1;
> > >> + }
> > >> + threads[th_conf++] = nprocs;
> > >> + threads[th_conf++] = nprocs + nprocs / 4;
> > >> +
> > >> + pthread_mutexattr_init (&attr);
> > >> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> > >> + snprintf (name, sizeof name, "type=adaptive");
> > >> +
> > >> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
> > >> + {
> > >> + int non_crt_len = non_crt_lens[k];
> > >> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
> > >> + {
> > >> + int crt_len = crt_lens[j];
> > >> + for (i = 0; i < th_conf; i++)
> > >> + {
> > >> + th_num = threads[i];
> > >> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx);
> > >> + }
> > >> + }
> > >> + }
> > >> +
> > >> + json_attr_object_end (&json_ctx);
> > >> +
> > >> + return rv;
> > >> +}
> > >> +
> > >> +#define TEST_FUNCTION do_bench ()
> > >> +
> > >> +#include "../test-skeleton.c"
> > >> --
> > >> 2.35.1
> > >>
> > >
> > > Can you run clang-format on this? Otherwise
> > > LGTM.
> > >
> >
> > clang-format done.
> > Nothing needs to change for this patch.
>
>
> Woops.
>
> LGTM.
I am pushing it now.
Thanks.
@@ -102,6 +102,7 @@ endif
bench-pthread := \
pthread-locks \
+ pthread-mutex-locks \
pthread_once \
thread_create \
# bench-pthread
@@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
$(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
$(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
$(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
+$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
new file mode 100644
@@ -0,0 +1,288 @@
+/* Measure mutex_lock for different threads and critical sections.
+ Copyright (C) 2020-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Configuration macros defined ahead of the "../test-skeleton.c" include
+ at the end of this file. NOTE(review): presumably consumed by that
+ skeleton; TIMEOUT looks like a limit in seconds (20 minutes) -- confirm
+ against test-skeleton.c. */
+#define TEST_MAIN
+#define TEST_NAME "pthread-mutex-locks"
+#define TIMEOUT (20 * 60)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/sysinfo.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* Mutex under test; do_one_test initializes it with ATTR before each run
+ and destroys it afterwards. */
+static pthread_mutex_t lock;
+/* Mutex attributes, configured once in do_bench. */
+static pthread_mutexattr_t attr;
+/* Barrier every worker waits on before it starts its timed loop;
+ re-created by do_one_test for the current thread count. */
+static pthread_barrier_t barrier;
+
+/* Initial per-thread iteration count; do_bench_one multiplies it by 10
+ until a single run lasts at least MIN_TEST_SEC. */
+#define START_ITERS 1000
+
+/* Build the filler functions below at optimization level 1; the matching
+ pop_options after do_filler_shared restores the previous options.
+ NOTE(review): presumably this keeps the filler work from being
+ optimized away or varying with the build flags -- confirm. */
+#pragma GCC push_options
+#pragma GCC optimize(1)
+
+/* Recursive filler computation. For I <= 2 the result is 10 + I;
+   otherwise it is the sum of the results for I - 1 and I - 2. The
+   function is marked noinline and starts with an empty asm statement
+   (presumably to keep the compiler from folding the call away).  */
+static int __attribute__ ((noinline))
+fibonacci (int i)
+{
+  asm ("");
+  return i <= 2 ? 10 + i : fibonacci (i - 1) + fibonacci (i - 2);
+}
+
+/* One unit of non-critical work: evaluate fibonacci (4) and copy that
+   many bytes between two buffers local to this call.  */
+static void
+do_filler (void)
+{
+  char dst[512];
+  char src[512];
+  int nbytes = fibonacci (4);
+
+  memcpy (dst, src, nbytes);
+}
+
+/* One unit of critical-section work: evaluate fibonacci (4) and copy
+   that many bytes between two static buffers, which are therefore
+   shared by every thread calling this function.  */
+static void
+do_filler_shared (void)
+{
+  static char dst[512];
+  static char src[512];
+  int nbytes = fibonacci (4);
+
+  memcpy (dst, src, nbytes);
+}
+
+#pragma GCC pop_options
+
+/* Unit of work executed per loop step inside the critical section
+ (shared static buffers) and outside of it (per-call buffers). */
+#define UNIT_WORK_CRT do_filler_shared ()
+#define UNIT_WORK_NON_CRT do_filler ()
+
+/* Execute the critical-section unit of work LENGTH + 1 times (the
+   counter runs from LENGTH down to 0 inclusive); a negative LENGTH
+   does no work.  */
+static inline void
+critical_section (int length)
+{
+  int step = length;
+
+  while (step >= 0)
+    {
+      UNIT_WORK_CRT;
+      step--;
+    }
+}
+
+/* Execute the non-critical unit of work LENGTH + 1 times (the counter
+   runs from LENGTH down to 0 inclusive); a negative LENGTH does no
+   work.  */
+static inline void
+non_critical_section (int length)
+{
+  int step = length;
+
+  while (step >= 0)
+    {
+      UNIT_WORK_NON_CRT;
+      step--;
+    }
+}
+
+/* Arguments handed to one worker thread, plus the slot the worker
+ writes its measured duration into. */
+typedef struct Worker_Params
+{
+ /* Number of lock/unlock rounds the worker performs. */
+ long iters;
+ /* Length argument passed to critical_section in every round. */
+ int crt_len;
+ /* Length argument passed to non_critical_section in every round. */
+ int non_crt_len;
+ /* Output: TIMING_DIFF of the worker's timed loop. */
+ timing_t duration;
+} Worker_Params;
+
+/* Thread body. V points to this thread's Worker_Params. The thread
+   waits on the global barrier, then times a loop of `iters' rounds, each
+   consisting of a critical section under the global lock followed by a
+   non-critical section. The elapsed time is stored in the parameter
+   block's `duration' field. Always returns NULL.  */
+static void *
+worker (void *v)
+{
+  Worker_Params *params = v;
+  const long rounds = params->iters;
+  const int locked_work = params->crt_len;
+  const int unlocked_work = params->non_crt_len;
+  timing_t t_begin, t_end;
+
+  pthread_barrier_wait (&barrier);
+  TIMING_NOW (t_begin);
+  for (long remaining = rounds; remaining != 0; remaining--)
+    {
+      pthread_mutex_lock (&lock);
+      critical_section (locked_work);
+      pthread_mutex_unlock (&lock);
+      non_critical_section (unlocked_work);
+    }
+  TIMING_NOW (t_end);
+
+  TIMING_DIFF (params->duration, t_begin, t_end);
+  return NULL;
+}
+
+/* Run one measurement. Initializes the global lock (with ATTR) and the
+   global barrier for NUM_THREADS participants, starts NUM_THREADS
+   workers that each perform ITERS rounds with the given critical and
+   non-critical section lengths, joins them, and tears the lock and
+   barrier down again.
+
+   Returns the mean of the per-thread durations reported by the workers
+   (computed in timing_t arithmetic, then converted to double).
+
+   If a worker thread cannot be created the process exits: the threads
+   already started would wait at the barrier forever, and the join loop
+   would read a thread id that was never written.  */
+static double
+do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
+{
+  int i;
+  timing_t mean;
+  Worker_Params *p, params[num_threads];
+  pthread_t threads[num_threads];
+
+  pthread_mutex_init (&lock, &attr);
+  pthread_barrier_init (&barrier, NULL, num_threads);
+
+  for (i = 0; i < num_threads; i++)
+    {
+      p = &params[i];
+      p->iters = iters;
+      p->crt_len = crt_len;
+      p->non_crt_len = non_crt_len;
+      /* pthread_create reports failure through its return value, not
+         through errno.  */
+      int err = pthread_create (&threads[i], NULL, worker, (void *) p);
+      if (err != 0)
+        {
+          fprintf (stderr, "pthread_create: %s\n", strerror (err));
+          exit (EXIT_FAILURE);
+        }
+    }
+  for (i = 0; i < num_threads; i++)
+    pthread_join (threads[i], NULL);
+
+  pthread_mutex_destroy (&lock);
+  pthread_barrier_destroy (&barrier);
+
+  mean = 0;
+  for (i = 0; i < num_threads; i++)
+    mean += params[i].duration;
+  mean /= num_threads;
+  return mean;
+}
+
+/* Number of samples that enter the mean/stdev; do_bench_one collects
+ RUN_COUNT + 2 and drops the fastest and the slowest one. */
+#define RUN_COUNT 10
+/* Minimum wall-clock time, in seconds, one run must take before
+ do_bench_one stops scaling up the iteration count. */
+#define MIN_TEST_SEC 0.01
+
+/* Benchmark a single configuration and write its statistics to JS.
+
+   NAME is the prefix of the JSON attribute name; NUM_THREADS, CRT_LEN
+   and NON_CRT_LEN are forwarded to do_one_test.
+
+   First the per-thread iteration count is calibrated: starting at
+   START_ITERS it is multiplied by 10 until one do_one_test run takes at
+   least MIN_TEST_SEC of wall-clock time (or the count reaches
+   LONG_MAX / 100). The last calibration run is kept as the first
+   sample, RUN_COUNT + 1 further runs are taken, and after sorting the
+   smallest and largest samples are reported as outliers and left out
+   of the mean and standard deviation. All reported values except
+   "duration" and "iterations" are divided by the total iteration count
+   (iterations * threads).  */
+static void
+do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len,
+              json_ctx_t *js)
+{
+  timing_t samples[RUN_COUNT + 2];
+  timing_t calib;
+  long iters = START_ITERS;
+  const long iters_limit = LONG_MAX / 100;
+
+  /* Calibration: grow ITERS until one run is long enough to measure.  */
+  for (;;)
+    {
+      struct timeval wall_begin, wall_end;
+
+      gettimeofday (&wall_begin, NULL);
+      calib = do_one_test (num_threads, crt_len, non_crt_len, iters);
+      gettimeofday (&wall_end, NULL);
+
+      double begin_sec = wall_begin.tv_sec + wall_begin.tv_usec / 1000000.0;
+      double end_sec = wall_end.tv_sec + wall_end.tv_usec / 1000000.0;
+      double elapsed = end_sec - begin_sec;
+      if (elapsed >= MIN_TEST_SEC || iters >= iters_limit)
+        break;
+
+      iters *= 10;
+    }
+
+  /* The final calibration run doubles as sample 0.  */
+  samples[0] = calib;
+  for (int n = 1; n < RUN_COUNT + 2; n++)
+    samples[n] = do_one_test (num_threads, crt_len, non_crt_len, iters);
+
+  /* Sort ascending so that samples[0] and samples[RUN_COUNT + 1] are
+     the fastest and slowest runs, which are treated as outliers.  */
+  for (int pos = 1; pos < RUN_COUNT + 2; pos++)
+    {
+      timing_t key = samples[pos];
+      int slot = pos;
+      while (slot > 0 && samples[slot - 1] > key)
+        {
+          samples[slot] = samples[slot - 1];
+          slot--;
+        }
+      samples[slot] = key;
+    }
+
+  /* Mean and sample standard deviation over the RUN_COUNT inner
+     samples, expressed per iteration.  */
+  long total_iters = iters * num_threads;
+  double mean = 0.0;
+  for (int n = 1; n < RUN_COUNT + 1; n++)
+    mean += (double) samples[n] / (double) total_iters;
+  mean /= RUN_COUNT;
+
+  double stdev = 0.0;
+  for (int n = 1; n < RUN_COUNT + 1; n++)
+    {
+      double dev = (double) samples[n] / (double) total_iters - mean;
+      stdev += dev * dev;
+    }
+  stdev = sqrt (stdev / (RUN_COUNT - 1));
+
+  char label[256];
+  snprintf (label, sizeof label, "%s,non_crt_len=%d,crt_len=%d,threads=%d",
+            name, non_crt_len, crt_len, num_threads);
+
+  json_attr_object_begin (js, label);
+
+  /* "duration" is the raw result of the last calibration run.  */
+  json_attr_double (js, "duration", (double) calib);
+  json_attr_double (js, "iterations", (double) total_iters);
+  json_attr_double (js, "mean", mean);
+  json_attr_double (js, "stdev", stdev);
+  json_attr_double (js, "min-outlier",
+                    (double) samples[0] / (double) total_iters);
+  json_attr_double (js, "min", (double) samples[1] / (double) total_iters);
+  json_attr_double (js, "max",
+                    (double) samples[RUN_COUNT] / (double) total_iters);
+  json_attr_double (js, "max-outlier",
+                    (double) samples[RUN_COUNT + 1] / (double) total_iters);
+
+  json_attr_object_end (js);
+}
+
+/* Capacity of the thread-count table in do_bench: up to TH_CONF_MAX - 2
+ power-of-two entries plus the nprocs and 1.25 * nprocs entries. */
+#define TH_CONF_MAX 10
+
+/* Benchmark entry point. Builds the list of thread counts, sets up an
+   adaptive mutex attribute, and runs do_bench_one for every combination
+   of non-critical length, critical length and thread count, writing the
+   results as one JSON object named "pthread_mutex_locks" to stdout.
+   Always returns 0.  */
+int
+do_bench (void)
+{
+  int rv = 0;
+  json_ctx_t json_ctx;
+  int th_num, th_conf, nprocs;
+  int threads[TH_CONF_MAX];
+  static const int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
+  static const int non_crt_lens[] = { 1, 32, 128 };
+  const size_t n_crt_lens = sizeof crt_lens / sizeof crt_lens[0];
+  const size_t n_non_crt_lens = sizeof non_crt_lens / sizeof non_crt_lens[0];
+  char name[128];
+
+  json_init (&json_ctx, 2, stdout);
+  json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
+
+  /* The thread config begins from 1, and increases by 2x until nprocs.
+     We also want to test the over-saturation case (1.25*nprocs).  */
+  nprocs = get_nprocs ();
+  th_num = 1;
+  for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
+    {
+      threads[th_conf] = th_num;
+      th_num <<= 1;
+    }
+  threads[th_conf++] = nprocs;
+  /* With fewer than 4 processors nprocs / 4 is 0, so the over-saturation
+     entry would equal nprocs: the same configuration would be measured
+     twice and emitted under a duplicate JSON attribute name.  */
+  if (nprocs / 4 > 0)
+    threads[th_conf++] = nprocs + nprocs / 4;
+
+  pthread_mutexattr_init (&attr);
+  pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+  snprintf (name, sizeof name, "type=adaptive");
+
+  for (size_t k = 0; k < n_non_crt_lens; k++)
+    {
+      int non_crt_len = non_crt_lens[k];
+      for (size_t j = 0; j < n_crt_lens; j++)
+        {
+          int crt_len = crt_lens[j];
+          for (int i = 0; i < th_conf; i++)
+            {
+              th_num = threads[i];
+              do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx);
+            }
+        }
+    }
+
+  /* Every mutex built from ATTR has been destroyed by do_one_test by
+     now, so the attribute object can be released.  */
+  pthread_mutexattr_destroy (&attr);
+
+  json_attr_object_end (&json_ctx);
+
+  return rv;
+}
+
+/* Hand do_bench to the shared test skeleton included below.
+ NOTE(review): presumably test-skeleton.c supplies main and invokes
+ TEST_FUNCTION -- confirm against ../test-skeleton.c. */
+#define TEST_FUNCTION do_bench ()
+
+#include "../test-skeleton.c"