benchtests: Add pthread-mutex-locks bench

Message ID 20220420054848.2774374-1-wangyang.guo@intel.com
State Superseded
Headers
Series benchtests: Add pthread-mutex-locks bench |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Wangyang Guo April 20, 2022, 5:48 a.m. UTC
  Benchmark for testing pthread mutex lock performance with different
thread counts and critical-section lengths.

The test configuration consists of 3 parts:
1. thread number
2. critical-section length
3. non-critical-section length

The thread number starts at 1 and doubles until it reaches the number of
CPU cores (nprocs). An additional over-saturation case (1.25 * nprocs) is
also included.
The critical section is represented by a loop of the shared do_filler();
its length is determined by the number of loop iterations.
The non-critical section is similar to the critical section, except that
it is based on the non-shared do_filler().

Currently, only the adaptive pthread_mutex lock is tested.
---
 benchtests/Makefile                    |   2 +
 benchtests/bench-pthread-mutex-locks.c | 297 +++++++++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 benchtests/bench-pthread-mutex-locks.c
  

Comments

Noah Goldstein April 20, 2022, 5:57 p.m. UTC | #1
On Wed, Apr 20, 2022 at 12:49 AM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> Benchmark for testing pthread mutex locks performance with different
> threads and critical sections.
>
> The test configuration consists of 3 parts:
> 1. thread number
> 2. critical-section length
> 3. non-critical-section length
>
> Thread number starts from 1 and increased by 2x until num of CPU cores
> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
> included.
> Critical-section is represented by a loop of shared do_filler(),
> length can be determined by the loop iters.
> Non-critical-section is similiar to the critical-section, except it's
> based on non-shared do_filler().
>
> Currently, adaptive pthread_mutex lock is tested.
> ---
>  benchtests/Makefile                    |   2 +
>  benchtests/bench-pthread-mutex-locks.c | 297 +++++++++++++++++++++++++
>  2 files changed, 299 insertions(+)
>  create mode 100644 benchtests/bench-pthread-mutex-locks.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 8dfca592fd..b477042e6c 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -102,6 +102,7 @@ endif
>
>  bench-pthread := \
>    pthread-locks \
> +  pthread-mutex-locks \
>    pthread_once \
>    thread_create \
>  # bench-pthread
> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
>  $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
>  $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
>  $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
>
>
>
> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
> new file mode 100644
> index 0000000000..76f7b43635
> --- /dev/null
> +++ b/benchtests/bench-pthread-mutex-locks.c
> @@ -0,0 +1,297 @@
> +/* Measure mutex_lock for different threads and critical sections.
> +   Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "pthread-mutex-locks"
> +#define TIMEOUT (20 * 60)
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <math.h>
> +#include <pthread.h>
> +#include <sys/time.h>
> +#include <sys/sysinfo.h>
> +#include "bench-timing.h"
> +#include "json-lib.h"
> +
> +static pthread_mutex_t lock;
> +static pthread_mutexattr_t attr;
> +static pthread_barrier_t barrier;
> +
> +#define START_ITERS 1000
> +
> +#pragma GCC push_options
> +#pragma GCC optimize(1)
> +
> +static int __attribute__ ((noinline)) fibonacci (int i)
> +{
> +  asm("");
> +  if (i > 2)
> +    return fibonacci (i - 1) + fibonacci (i - 2);
> +  return 10 + i;
> +}
> +
> +static void
> +do_filler (void)
> +{
> +  char buf1[512], buf2[512];
> +  int f = fibonacci (4);
> +  memcpy (buf1, buf2, f);
> +}
> +
> +static void
> +do_filler_shared (void)
> +{
> +  static char buf1[512], buf2[512];
> +  int f = fibonacci (4);
> +  memcpy (buf1, buf2, f);
> +}
> +
> +#pragma GCC pop_options
> +
> +#define UNIT_WORK_CRT do_filler_shared ()
> +#define UNIT_WORK_NON_CRT do_filler ()
> +
> +static inline void
> +critical_section (int length)
> +{
> +  for (int i = length; i >= 0; i--)
> +    UNIT_WORK_CRT;
> +}
> +
> +static inline void
> +non_critical_section (int length)
> +{
> +  for (int i = length; i >= 0; i--)
> +    UNIT_WORK_NON_CRT;
> +}
> +
> +typedef struct Worker_Params
> +{
> +  long iters;
> +  int crt_len;
> +  int non_crt_len;
> +  timing_t duration;
> +} Worker_Params;
> +
> +static void *
> +worker (void *v)
> +{
> +  timing_t start, stop;
> +  Worker_Params *p = (Worker_Params *) v;
> +  long iters = p->iters;
> +  int crt_len = p->crt_len;
> +  int non_crt_len = p->non_crt_len;
> +
> +  pthread_barrier_wait (&barrier);
> +  TIMING_NOW (start);
> +  while (iters--)
> +    {
> +      pthread_mutex_lock (&lock);
> +      critical_section (crt_len);
> +      pthread_mutex_unlock (&lock);
> +      non_critical_section (non_crt_len);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (p->duration, start, stop);
> +  return NULL;
> +}
> +
> +static double
> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> +{
> +  int i;
> +  timing_t mean;
> +  Worker_Params *p, params[num_threads];
> +  pthread_t threads[num_threads];
> +
> +  pthread_mutex_init (&lock, &attr);
> +  pthread_barrier_init (&barrier, NULL, num_threads);
> +
> +  for (i = 0; i < num_threads; i++)
> +    {
> +      p = &params[i];
> +      p->iters = iters;
> +      p->crt_len = crt_len;
> +      p->non_crt_len = non_crt_len;
> +      pthread_create (&threads[i], NULL, worker, (void *) p);
> +    }
> +  for (i = 0; i < num_threads; i++)
> +    pthread_join (threads[i], NULL);
> +
> +  pthread_mutex_destroy (&lock);
> +  pthread_barrier_destroy (&barrier);
> +
> +  mean = 0;
> +  for (i = 0; i < num_threads; i++)
> +    mean += params[i].duration;
> +  mean /= num_threads;
> +  return mean;
> +}
> +
> +#define RUN_COUNT 10
> +#define MIN_TEST_SEC 0.01
> +
> +static void
> +do_bench_1 (int num_threads, int crt_len, int non_crt_len, json_ctx_t *js)
> +{
> +  timing_t cur;
> +  struct timeval ts, te;
> +  double tsd, ted, td;
> +  long iters, iters_limit, total_iters;
> +  timing_t curs[RUN_COUNT + 2];
> +  int i, j;
> +  double mean, stdev;
> +
> +  iters = START_ITERS;
> +  iters_limit = LONG_MAX / 100;
> +
> +  while (1)
> +    {
> +      gettimeofday (&ts, NULL);
> +      cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
> +      gettimeofday (&te, NULL);
> +      /* Make sure the test to run at least MIN_TEST_SEC.  */
> +      tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
> +      ted = te.tv_sec + te.tv_usec / 1000000.0;
> +      td = ted - tsd;
> +      if (td >= MIN_TEST_SEC || iters >= iters_limit)
> +       break;
> +
> +      iters *= 10;
> +    }
> +
> +  curs[0] = cur;
> +  for (i = 1; i < RUN_COUNT + 2; i++)
> +    curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
> +
> +  /* Sort the results so we can discard the fastest and slowest
> +     times as outliers.  */
> +  for (i = 0; i < RUN_COUNT + 1; i++)
> +    for (j = i + 1; j < RUN_COUNT + 2; j++)
> +      if (curs[i] > curs[j])
> +       {
> +         timing_t temp = curs[i];
> +         curs[i] = curs[j];
> +         curs[j] = temp;
> +       }
> +
> +  /* Calculate mean and standard deviation.  */
> +  mean = 0.0;
> +  total_iters = iters * num_threads;
> +  for (i = 1; i < RUN_COUNT + 1; i++)
> +    mean += (double) curs[i] / (double) total_iters;
> +  mean /= RUN_COUNT;
> +
> +  stdev = 0.0;
> +  for (i = 1; i < RUN_COUNT + 1; i++)
> +    {
> +      double s = (double) curs[i] / (double) total_iters - mean;
> +      stdev += s * s;
> +    }
> +  stdev = sqrt (stdev / (RUN_COUNT - 1));
> +
> +  json_element_object_begin (js);
> +  json_attr_uint (js, "thread", num_threads);
> +  json_attr_double (js, "mean", mean);
> +  json_attr_double (js, "stdev", stdev);
> +  json_attr_double (js, "min-outlier",
> +                   (double) curs[0] / (double) total_iters);
> +  json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
> +  json_attr_double (js, "max",
> +                   (double) curs[RUN_COUNT] / (double) total_iters);
> +  json_attr_double (js, "max-outlier",
> +                   (double) curs[RUN_COUNT + 1] / (double) total_iters);
> +  json_element_object_end (js);
> +}
> +
> +#define TH_CONF_MAX 10
> +
> +int
> +do_bench (void)
> +{
> +  int rv = 0;
> +  json_ctx_t json_ctx;
> +  int i, j, k;
> +  int th_num, th_conf, nprocs;
> +  int threads[TH_CONF_MAX];
> +  int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
> +  int non_crt_lens[] = { 1, 32, 128 };
> +
> +  json_init (&json_ctx, 2, stdout);
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  /* The thread config begins from 1, and increases by 2x until nprocs.
> +     We also wants to test over-saturation case (1.25*nprocs).  */
> +  nprocs = get_nprocs ();
> +  th_num = 1;
> +  for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
> +    {
> +      threads[th_conf] = th_num;
> +      th_num <<= 1;
> +    }
> +  threads[th_conf++] = nprocs;
> +  threads[th_conf++] = nprocs + nprocs / 4;
> +
> +  json_array_begin (&json_ctx, "threads");
> +  for (i = 0; i < th_conf; i++)
> +    json_element_int (&json_ctx, threads[i]);
> +  json_array_end (&json_ctx);
> +
> +  pthread_mutexattr_init (&attr);
> +  pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> +  json_attr_string (&json_ctx, "lock-type", "adaptive-mutex");
> +
> +  json_array_begin (&json_ctx, "non-critical-sections");
> +  for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
> +    {
> +      int non_crt_len = non_crt_lens[k];
> +      json_element_object_begin (&json_ctx);
> +      json_attr_uint (&json_ctx, "non-critical-length", non_crt_len);
> +      json_array_begin (&json_ctx, "critical-sections");
> +      for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
> +       {
> +         int crt_len = crt_lens[j];
> +         json_element_object_begin (&json_ctx);
> +         json_attr_uint (&json_ctx, "critical-length", crt_len);
> +         json_array_begin (&json_ctx, "results");
> +         for (i = 0; i < th_conf; i++)
> +           {
> +             th_num = threads[i];
> +             do_bench_1 (th_num, crt_len, non_crt_len, &json_ctx);
> +           }
> +         json_array_end (&json_ctx);
> +         json_element_object_end (&json_ctx);
> +       }
> +      json_array_end (&json_ctx);
> +      json_element_object_end (&json_ctx);
> +    }
> +  json_array_end (&json_ctx);
> +
> +  json_document_end (&json_ctx);
> +
> +  return rv;
> +}
> +
> +#define TEST_FUNCTION do_bench ()
> +
> +#include "../test-skeleton.c"
> --
> 2.35.1
>

When I run this I get the following error:

$> (cd /home/noah/programs/opensource/glibc-dev/build/glibc/; unset
LD_LIBRARY_PATH; make --silent; make  bench
BENCHSET="bench-pthread";);

Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread-locks
Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread-mutex-locks
Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread_once
Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-thread_create
Benchmark output in
/home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench.out
is not JSON.
make[1]: *** [Makefile:412: bench-func] Error 65
rm /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-thread_create.c
/home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread_once.c
make[1]: Leaving directory
'/home/noah/programs/opensource/glibc-dev/src/glibc/benchtests'
make: *** [Makefile:16: bench] Error 2
  
develop--- via Libc-alpha April 21, 2022, 1:47 a.m. UTC | #2
On 4/21/2022 1:57 AM, Noah Goldstein via Libc-alpha wrote:
> On Wed, Apr 20, 2022 at 12:49 AM Wangyang Guo <wangyang.guo@intel.com> wrote:
>>
>> Benchmark for testing pthread mutex locks performance with different
>> threads and critical sections.
>>
>> The test configuration consists of 3 parts:
>> 1. thread number
>> 2. critical-section length
>> 3. non-critical-section length
>>
>> Thread number starts from 1 and increased by 2x until num of CPU cores
>> (nprocs). An additional over-saturation case (1.25 * nprocs) is also
>> included.
>> Critical-section is represented by a loop of shared do_filler(),
>> length can be determined by the loop iters.
>> Non-critical-section is similiar to the critical-section, except it's
>> based on non-shared do_filler().
>>
>> Currently, adaptive pthread_mutex lock is tested.
>> ---
>>   benchtests/Makefile                    |   2 +
>>   benchtests/bench-pthread-mutex-locks.c | 297 +++++++++++++++++++++++++
>>   2 files changed, 299 insertions(+)
>>   create mode 100644 benchtests/bench-pthread-mutex-locks.c
>>
>> diff --git a/benchtests/Makefile b/benchtests/Makefile
>> index 8dfca592fd..b477042e6c 100644
>> --- a/benchtests/Makefile
>> +++ b/benchtests/Makefile
>> @@ -102,6 +102,7 @@ endif
>>
>>   bench-pthread := \
>>     pthread-locks \
>> +  pthread-mutex-locks \
>>     pthread_once \
>>     thread_create \
>>   # bench-pthread
>> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
>>   $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
>>   $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
>>   $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
>> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
>>
>>
>>
>> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
>> new file mode 100644
>> index 0000000000..76f7b43635
>> --- /dev/null
>> +++ b/benchtests/bench-pthread-mutex-locks.c
>> @@ -0,0 +1,297 @@
>> +/* Measure mutex_lock for different threads and critical sections.
>> +   Copyright (C) 2020-2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#define TEST_MAIN
>> +#define TEST_NAME "pthread-mutex-locks"
>> +#define TIMEOUT (20 * 60)
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include <math.h>
>> +#include <pthread.h>
>> +#include <sys/time.h>
>> +#include <sys/sysinfo.h>
>> +#include "bench-timing.h"
>> +#include "json-lib.h"
>> +
>> +static pthread_mutex_t lock;
>> +static pthread_mutexattr_t attr;
>> +static pthread_barrier_t barrier;
>> +
>> +#define START_ITERS 1000
>> +
>> +#pragma GCC push_options
>> +#pragma GCC optimize(1)
>> +
>> +static int __attribute__ ((noinline)) fibonacci (int i)
>> +{
>> +  asm("");
>> +  if (i > 2)
>> +    return fibonacci (i - 1) + fibonacci (i - 2);
>> +  return 10 + i;
>> +}
>> +
>> +static void
>> +do_filler (void)
>> +{
>> +  char buf1[512], buf2[512];
>> +  int f = fibonacci (4);
>> +  memcpy (buf1, buf2, f);
>> +}
>> +
>> +static void
>> +do_filler_shared (void)
>> +{
>> +  static char buf1[512], buf2[512];
>> +  int f = fibonacci (4);
>> +  memcpy (buf1, buf2, f);
>> +}
>> +
>> +#pragma GCC pop_options
>> +
>> +#define UNIT_WORK_CRT do_filler_shared ()
>> +#define UNIT_WORK_NON_CRT do_filler ()
>> +
>> +static inline void
>> +critical_section (int length)
>> +{
>> +  for (int i = length; i >= 0; i--)
>> +    UNIT_WORK_CRT;
>> +}
>> +
>> +static inline void
>> +non_critical_section (int length)
>> +{
>> +  for (int i = length; i >= 0; i--)
>> +    UNIT_WORK_NON_CRT;
>> +}
>> +
>> +typedef struct Worker_Params
>> +{
>> +  long iters;
>> +  int crt_len;
>> +  int non_crt_len;
>> +  timing_t duration;
>> +} Worker_Params;
>> +
>> +static void *
>> +worker (void *v)
>> +{
>> +  timing_t start, stop;
>> +  Worker_Params *p = (Worker_Params *) v;
>> +  long iters = p->iters;
>> +  int crt_len = p->crt_len;
>> +  int non_crt_len = p->non_crt_len;
>> +
>> +  pthread_barrier_wait (&barrier);
>> +  TIMING_NOW (start);
>> +  while (iters--)
>> +    {
>> +      pthread_mutex_lock (&lock);
>> +      critical_section (crt_len);
>> +      pthread_mutex_unlock (&lock);
>> +      non_critical_section (non_crt_len);
>> +    }
>> +  TIMING_NOW (stop);
>> +
>> +  TIMING_DIFF (p->duration, start, stop);
>> +  return NULL;
>> +}
>> +
>> +static double
>> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
>> +{
>> +  int i;
>> +  timing_t mean;
>> +  Worker_Params *p, params[num_threads];
>> +  pthread_t threads[num_threads];
>> +
>> +  pthread_mutex_init (&lock, &attr);
>> +  pthread_barrier_init (&barrier, NULL, num_threads);
>> +
>> +  for (i = 0; i < num_threads; i++)
>> +    {
>> +      p = &params[i];
>> +      p->iters = iters;
>> +      p->crt_len = crt_len;
>> +      p->non_crt_len = non_crt_len;
>> +      pthread_create (&threads[i], NULL, worker, (void *) p);
>> +    }
>> +  for (i = 0; i < num_threads; i++)
>> +    pthread_join (threads[i], NULL);
>> +
>> +  pthread_mutex_destroy (&lock);
>> +  pthread_barrier_destroy (&barrier);
>> +
>> +  mean = 0;
>> +  for (i = 0; i < num_threads; i++)
>> +    mean += params[i].duration;
>> +  mean /= num_threads;
>> +  return mean;
>> +}
>> +
>> +#define RUN_COUNT 10
>> +#define MIN_TEST_SEC 0.01
>> +
>> +static void
>> +do_bench_1 (int num_threads, int crt_len, int non_crt_len, json_ctx_t *js)
>> +{
>> +  timing_t cur;
>> +  struct timeval ts, te;
>> +  double tsd, ted, td;
>> +  long iters, iters_limit, total_iters;
>> +  timing_t curs[RUN_COUNT + 2];
>> +  int i, j;
>> +  double mean, stdev;
>> +
>> +  iters = START_ITERS;
>> +  iters_limit = LONG_MAX / 100;
>> +
>> +  while (1)
>> +    {
>> +      gettimeofday (&ts, NULL);
>> +      cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
>> +      gettimeofday (&te, NULL);
>> +      /* Make sure the test to run at least MIN_TEST_SEC.  */
>> +      tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
>> +      ted = te.tv_sec + te.tv_usec / 1000000.0;
>> +      td = ted - tsd;
>> +      if (td >= MIN_TEST_SEC || iters >= iters_limit)
>> +       break;
>> +
>> +      iters *= 10;
>> +    }
>> +
>> +  curs[0] = cur;
>> +  for (i = 1; i < RUN_COUNT + 2; i++)
>> +    curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
>> +
>> +  /* Sort the results so we can discard the fastest and slowest
>> +     times as outliers.  */
>> +  for (i = 0; i < RUN_COUNT + 1; i++)
>> +    for (j = i + 1; j < RUN_COUNT + 2; j++)
>> +      if (curs[i] > curs[j])
>> +       {
>> +         timing_t temp = curs[i];
>> +         curs[i] = curs[j];
>> +         curs[j] = temp;
>> +       }
>> +
>> +  /* Calculate mean and standard deviation.  */
>> +  mean = 0.0;
>> +  total_iters = iters * num_threads;
>> +  for (i = 1; i < RUN_COUNT + 1; i++)
>> +    mean += (double) curs[i] / (double) total_iters;
>> +  mean /= RUN_COUNT;
>> +
>> +  stdev = 0.0;
>> +  for (i = 1; i < RUN_COUNT + 1; i++)
>> +    {
>> +      double s = (double) curs[i] / (double) total_iters - mean;
>> +      stdev += s * s;
>> +    }
>> +  stdev = sqrt (stdev / (RUN_COUNT - 1));
>> +
>> +  json_element_object_begin (js);
>> +  json_attr_uint (js, "thread", num_threads);
>> +  json_attr_double (js, "mean", mean);
>> +  json_attr_double (js, "stdev", stdev);
>> +  json_attr_double (js, "min-outlier",
>> +                   (double) curs[0] / (double) total_iters);
>> +  json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
>> +  json_attr_double (js, "max",
>> +                   (double) curs[RUN_COUNT] / (double) total_iters);
>> +  json_attr_double (js, "max-outlier",
>> +                   (double) curs[RUN_COUNT + 1] / (double) total_iters);
>> +  json_element_object_end (js);
>> +}
>> +
>> +#define TH_CONF_MAX 10
>> +
>> +int
>> +do_bench (void)
>> +{
>> +  int rv = 0;
>> +  json_ctx_t json_ctx;
>> +  int i, j, k;
>> +  int th_num, th_conf, nprocs;
>> +  int threads[TH_CONF_MAX];
>> +  int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
>> +  int non_crt_lens[] = { 1, 32, 128 };
>> +
>> +  json_init (&json_ctx, 2, stdout);
>> +  json_document_begin (&json_ctx);
>> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
>> +
>> +  /* The thread config begins from 1, and increases by 2x until nprocs.
>> +     We also wants to test over-saturation case (1.25*nprocs).  */
>> +  nprocs = get_nprocs ();
>> +  th_num = 1;
>> +  for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
>> +    {
>> +      threads[th_conf] = th_num;
>> +      th_num <<= 1;
>> +    }
>> +  threads[th_conf++] = nprocs;
>> +  threads[th_conf++] = nprocs + nprocs / 4;
>> +
>> +  json_array_begin (&json_ctx, "threads");
>> +  for (i = 0; i < th_conf; i++)
>> +    json_element_int (&json_ctx, threads[i]);
>> +  json_array_end (&json_ctx);
>> +
>> +  pthread_mutexattr_init (&attr);
>> +  pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
>> +  json_attr_string (&json_ctx, "lock-type", "adaptive-mutex");
>> +
>> +  json_array_begin (&json_ctx, "non-critical-sections");
>> +  for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
>> +    {
>> +      int non_crt_len = non_crt_lens[k];
>> +      json_element_object_begin (&json_ctx);
>> +      json_attr_uint (&json_ctx, "non-critical-length", non_crt_len);
>> +      json_array_begin (&json_ctx, "critical-sections");
>> +      for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
>> +       {
>> +         int crt_len = crt_lens[j];
>> +         json_element_object_begin (&json_ctx);
>> +         json_attr_uint (&json_ctx, "critical-length", crt_len);
>> +         json_array_begin (&json_ctx, "results");
>> +         for (i = 0; i < th_conf; i++)
>> +           {
>> +             th_num = threads[i];
>> +             do_bench_1 (th_num, crt_len, non_crt_len, &json_ctx);
>> +           }
>> +         json_array_end (&json_ctx);
>> +         json_element_object_end (&json_ctx);
>> +       }
>> +      json_array_end (&json_ctx);
>> +      json_element_object_end (&json_ctx);
>> +    }
>> +  json_array_end (&json_ctx);
>> +
>> +  json_document_end (&json_ctx);
>> +
>> +  return rv;
>> +}
>> +
>> +#define TEST_FUNCTION do_bench ()
>> +
>> +#include "../test-skeleton.c"
>> --
>> 2.35.1
>>
> 
> When I run this I get the following error:
> 
> $> (cd /home/noah/programs/opensource/glibc-dev/build/glibc/; unset
> LD_LIBRARY_PATH; make --silent; make  bench
> BENCHSET="bench-pthread";);
> 
> Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread-locks
> Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread-mutex-locks
> Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread_once
> Running /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-thread_create
> Benchmark output in
> /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench.out
> is not JSON.
> make[1]: *** [Makefile:412: bench-func] Error 65
> rm /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-thread_create.c
> /home/noah/programs/opensource/glibc-dev/build/glibc/benchtests/bench-pthread_once.c
> make[1]: Leaving directory
> '/home/noah/programs/opensource/glibc-dev/src/glibc/benchtests'
> make: *** [Makefile:16: bench] Error 2
> 

I am able to reproduce this after installing the Python jsonschema module.
The JSON output needs to be in a "key": {...} format in order to fit into
bench.out.
I will fix it.
  

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..b477042e6c 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -102,6 +102,7 @@  endif
 
 bench-pthread := \
   pthread-locks \
+  pthread-mutex-locks \
   pthread_once \
   thread_create \
 # bench-pthread
@@ -281,6 +282,7 @@  $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests)
 $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests)
 $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests)
+$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests)
 
 
 
diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c
new file mode 100644
index 0000000000..76f7b43635
--- /dev/null
+++ b/benchtests/bench-pthread-mutex-locks.c
@@ -0,0 +1,297 @@ 
+/* Measure mutex_lock for different threads and critical sections.
+   Copyright (C) 2020-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "pthread-mutex-locks"
+#define TIMEOUT (20 * 60)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/sysinfo.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+static pthread_mutex_t lock;
+static pthread_mutexattr_t attr;
+static pthread_barrier_t barrier;
+
+#define START_ITERS 1000
+
+#pragma GCC push_options
+#pragma GCC optimize(1)
+
+static int __attribute__ ((noinline)) fibonacci (int i)
+{
+  asm("");
+  if (i > 2)
+    return fibonacci (i - 1) + fibonacci (i - 2);
+  return 10 + i;
+}
+
+static void
+do_filler (void)
+{
+  char buf1[512], buf2[512];
+  int f = fibonacci (4);
+  memcpy (buf1, buf2, f);
+}
+
+static void
+do_filler_shared (void)
+{
+  static char buf1[512], buf2[512];
+  int f = fibonacci (4);
+  memcpy (buf1, buf2, f);
+}
+
+#pragma GCC pop_options
+
+#define UNIT_WORK_CRT do_filler_shared ()
+#define UNIT_WORK_NON_CRT do_filler ()
+
+static inline void
+critical_section (int length)
+{
+  for (int i = length; i >= 0; i--)
+    UNIT_WORK_CRT;
+}
+
+static inline void
+non_critical_section (int length)
+{
+  for (int i = length; i >= 0; i--)
+    UNIT_WORK_NON_CRT;
+}
+
+typedef struct Worker_Params
+{
+  long iters;
+  int crt_len;
+  int non_crt_len;
+  timing_t duration;
+} Worker_Params;
+
+static void *
+worker (void *v)
+{
+  timing_t start, stop;
+  Worker_Params *p = (Worker_Params *) v;
+  long iters = p->iters;
+  int crt_len = p->crt_len;
+  int non_crt_len = p->non_crt_len;
+
+  pthread_barrier_wait (&barrier);
+  TIMING_NOW (start);
+  while (iters--)
+    {
+      pthread_mutex_lock (&lock);
+      critical_section (crt_len);
+      pthread_mutex_unlock (&lock);
+      non_critical_section (non_crt_len);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (p->duration, start, stop);
+  return NULL;
+}
+
+static double
+do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
+{
+  int i;
+  timing_t mean;
+  Worker_Params *p, params[num_threads];
+  pthread_t threads[num_threads];
+
+  pthread_mutex_init (&lock, &attr);
+  pthread_barrier_init (&barrier, NULL, num_threads);
+
+  for (i = 0; i < num_threads; i++)
+    {
+      p = &params[i];
+      p->iters = iters;
+      p->crt_len = crt_len;
+      p->non_crt_len = non_crt_len;
+      pthread_create (&threads[i], NULL, worker, (void *) p);
+    }
+  for (i = 0; i < num_threads; i++)
+    pthread_join (threads[i], NULL);
+
+  pthread_mutex_destroy (&lock);
+  pthread_barrier_destroy (&barrier);
+
+  mean = 0;
+  for (i = 0; i < num_threads; i++)
+    mean += params[i].duration;
+  mean /= num_threads;
+  return mean;
+}
+
+#define RUN_COUNT 10
+#define MIN_TEST_SEC 0.01
+
+static void
+do_bench_1 (int num_threads, int crt_len, int non_crt_len, json_ctx_t *js)
+{
+  timing_t cur;
+  struct timeval ts, te;
+  double tsd, ted, td;
+  long iters, iters_limit, total_iters;
+  timing_t curs[RUN_COUNT + 2];
+  int i, j;
+  double mean, stdev;
+
+  iters = START_ITERS;
+  iters_limit = LONG_MAX / 100;
+
+  while (1)
+    {
+      gettimeofday (&ts, NULL);
+      cur = do_one_test (num_threads, crt_len, non_crt_len, iters);
+      gettimeofday (&te, NULL);
+      /* Make sure the test to run at least MIN_TEST_SEC.  */
+      tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
+      ted = te.tv_sec + te.tv_usec / 1000000.0;
+      td = ted - tsd;
+      if (td >= MIN_TEST_SEC || iters >= iters_limit)
+	break;
+
+      iters *= 10;
+    }
+
+  curs[0] = cur;
+  for (i = 1; i < RUN_COUNT + 2; i++)
+    curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters);
+
+  /* Sort the results so we can discard the fastest and slowest
+     times as outliers.  */
+  for (i = 0; i < RUN_COUNT + 1; i++)
+    for (j = i + 1; j < RUN_COUNT + 2; j++)
+      if (curs[i] > curs[j])
+	{
+	  timing_t temp = curs[i];
+	  curs[i] = curs[j];
+	  curs[j] = temp;
+	}
+
+  /* Calculate mean and standard deviation.  */
+  mean = 0.0;
+  total_iters = iters * num_threads;
+  for (i = 1; i < RUN_COUNT + 1; i++)
+    mean += (double) curs[i] / (double) total_iters;
+  mean /= RUN_COUNT;
+
+  stdev = 0.0;
+  for (i = 1; i < RUN_COUNT + 1; i++)
+    {
+      double s = (double) curs[i] / (double) total_iters - mean;
+      stdev += s * s;
+    }
+  stdev = sqrt (stdev / (RUN_COUNT - 1));
+
+  json_element_object_begin (js);
+  json_attr_uint (js, "thread", num_threads);
+  json_attr_double (js, "mean", mean);
+  json_attr_double (js, "stdev", stdev);
+  json_attr_double (js, "min-outlier",
+		    (double) curs[0] / (double) total_iters);
+  json_attr_double (js, "min", (double) curs[1] / (double) total_iters);
+  json_attr_double (js, "max",
+		    (double) curs[RUN_COUNT] / (double) total_iters);
+  json_attr_double (js, "max-outlier",
+		    (double) curs[RUN_COUNT + 1] / (double) total_iters);
+  json_element_object_end (js);
+}
+
+#define TH_CONF_MAX 10
+
+int
+do_bench (void)
+{
+  int rv = 0;
+  json_ctx_t json_ctx;
+  int i, j, k;
+  int th_num, th_conf, nprocs;
+  int threads[TH_CONF_MAX];
+  int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 };
+  int non_crt_lens[] = { 1, 32, 128 };
+
+  json_init (&json_ctx, 2, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  /* The thread config begins from 1, and increases by 2x until nprocs.
+     We also wants to test over-saturation case (1.25*nprocs).  */
+  nprocs = get_nprocs ();
+  th_num = 1;
+  for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++)
+    {
+      threads[th_conf] = th_num;
+      th_num <<= 1;
+    }
+  threads[th_conf++] = nprocs;
+  threads[th_conf++] = nprocs + nprocs / 4;
+
+  json_array_begin (&json_ctx, "threads");
+  for (i = 0; i < th_conf; i++)
+    json_element_int (&json_ctx, threads[i]);
+  json_array_end (&json_ctx);
+
+  pthread_mutexattr_init (&attr);
+  pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+  json_attr_string (&json_ctx, "lock-type", "adaptive-mutex");
+
+  json_array_begin (&json_ctx, "non-critical-sections");
+  for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
+    {
+      int non_crt_len = non_crt_lens[k];
+      json_element_object_begin (&json_ctx);
+      json_attr_uint (&json_ctx, "non-critical-length", non_crt_len);
+      json_array_begin (&json_ctx, "critical-sections");
+      for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++)
+	{
+	  int crt_len = crt_lens[j];
+	  json_element_object_begin (&json_ctx);
+	  json_attr_uint (&json_ctx, "critical-length", crt_len);
+	  json_array_begin (&json_ctx, "results");
+	  for (i = 0; i < th_conf; i++)
+	    {
+	      th_num = threads[i];
+	      do_bench_1 (th_num, crt_len, non_crt_len, &json_ctx);
+	    }
+	  json_array_end (&json_ctx);
+	  json_element_object_end (&json_ctx);
+	}
+      json_array_end (&json_ctx);
+      json_element_object_end (&json_ctx);
+    }
+  json_array_end (&json_ctx);
+
+  json_document_end (&json_ctx);
+
+  return rv;
+}
+
+#define TEST_FUNCTION do_bench ()
+
+#include "../test-skeleton.c"