[v2,2/5] benchtests: Add memset zero fill benchtest

Message ID 20210720063500.362313-1-naohirot@fujitsu.com
State Deferred
Series benchtests: Add memset zero fill benchmark tests

Checks

Context Check Description
dj/TryBot-apply_patch fail Patch failed to apply to master at the time it was sent

Commit Message

Naohiro Tamura July 20, 2021, 6:35 a.m. UTC
Memset is most often called with 0 as its second parameter.
However, the zero fill performance cannot be measured precisely with
bench-memset.c or bench-memset-large.c.
X86_64 micro-architectures have some zero-over-zero optimization, and
AArch64 micro-architectures also have some optimization for the DC ZVA
instruction.
This patch provides bench-memset-zerofill.c, which is suitable for
analyzing zero fill performance with zero-over-zero and zero-over-one
test cases from 16KB (L1), through L2 and L3, up to 64MB (RAM).
---
 benchtests/Makefile                |   2 +-
 benchtests/bench-memset-zerofill.c | 128 +++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-zerofill.c
  

Comments

Noah Goldstein July 20, 2021, 4:48 p.m. UTC | #1
On Tue, Jul 20, 2021 at 2:35 AM Naohiro Tamura <naohirot@fujitsu.com> wrote:

> Memset takes 0 as the second parameter in most cases.
> However, we cannot measure the zero fill performance by bench-memset.c
> and bench-memset-large.c precisely.
> X86_64 micro-architecture has some zero-over-zero optimization, and
> AArch64 micro-architecture also has some optimization for DC ZVA
> instruction.
> This patch provides bench-memset-zerofill.c which is suitable to
> analyze the zero fill performance by zero-over-zero and zero-over-one
> test cases from 16KB(L1), through L2 and L3, to 64MB(RAM).
> ---
>  benchtests/Makefile                |   2 +-
>  benchtests/bench-memset-zerofill.c | 128 +++++++++++++++++++++++++++++
>  2 files changed, 129 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-zerofill.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..21b95c736190 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem
> memmove \
>                    strncasecmp strncat strncmp strncpy strnlen strpbrk
> strrchr \
>                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep
> strtok \
>                    strcoll memcpy-large memcpy-random memmove-large
> memset-large \
> -                  memcpy-walk memset-walk memmove-walk
> +                  memcpy-walk memset-walk memmove-walk memset-zerofill
>
>  # Build and run locale-dependent benchmarks only if we're building
> natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-zerofill.c
> b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..2579b6edd09e
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,128 @@
> +/* Measure memset functions with zero fill data.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE (16 * 1024)
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
>
Do we want __attribute__((noinline, noclone))?

> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> +            size_t n)
> +{
> +  size_t i, iters = 16;
>

I think 16 is probably too few iterations for reliable benchmarking.
Maybe `INNER_LOOP_ITERS` which is 8192

> +  timing_t start, stop, cur;
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; i += 2)
> +    {
> +      CALL (impl, s, c1, n);
>
I am a bit worried that the overhead from the first call with `c1` will
distort the results.
Is it possible to implement it with a nested loop where you fill `s` with
`c1` for
`n * inner_loop_iterations` in the outer loop and in the inner loop fill
`c2` on `s + n * i`?
In that case maybe 16 for inner loop iterations and 512 for outer loop
iterations.
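
Something like this, roughly (an untested sketch; it assumes the buffer
is at least 16 * n bytes, and the loop counts are only suggestions):

  size_t i, j;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 512; i++)
    {
      /* Refill the whole 16 * n region with c1, outside the timed section.  */
      CALL (impl, s, c1, n * 16);
      TIMING_NOW (start);
      /* Time only the c2 fills, each over a fresh n-byte chunk.  */
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) (512 * 16));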

> +      CALL (impl, s, c2, n);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> +{
> +  align &= 63;
>
Can you make this `align &= getpagesize () - 1;`?

> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char1", c1);
> +  json_attr_int (json_ctx, "char2", c2);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c1, c2;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  c2 = 0;
> +  for (c1 = 0; c1 < 2; c1++)
> +    for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +      {
> +       do_test (&json_ctx, 0, c1, c2, i);
> +       do_test (&json_ctx, 3, c1, c2, i);
> +      }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> --
> 2.17.1
>
>
  
develop--- via Libc-alpha July 21, 2021, 12:56 p.m. UTC | #2
Hi Noah,

Thank you for the review.

> > +#define TEST_MAIN
> > +#define TEST_NAME "memset"
> > +#define START_SIZE (16 * 1024)
> > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > +#define TIMEOUT (20 * 60)
> > +#include "bench-string.h"
> > +
> > +#include "json-lib.h"
> > +
> > +void *generic_memset (void *, int, size_t);
> > +typedef void *(*proto_t) (void *, int, size_t);
> > +
> > +IMPL (MEMSET, 1)
> > +IMPL (generic_memset, 0)
> > +
> > +static void
> Do we want __attribute__((noinline, noclone))? 

Yes, I'll add it.

> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> > +            size_t n)
> > +{
> > +  size_t i, iters = 16;
> 
> I think 16 is probably too few iterations for reliable benchmarking. 
> Maybe `INNER_LOOP_ITERS` which is 8192

I tried it. If it is changed to 8192, it hits the TIMEOUT (20 * 60) on a64fx.
Please check the code below.

> 
> > +  timing_t start, stop, cur;
> > +
> > +  TIMING_NOW (start);
> > +  for (i = 0; i < iters; i += 2)
> > +    {
> > +      CALL (impl, s, c1, n);
> I am a bit worried that the overhead from the first call with `c1` will distort the results.
> Is it possible to implement it with a nested loop where you fill `s` with `c1` for 
> `n * inner_loop_iterations` in the outer loop and in the inner loop fill `c2` on `s + n * i`? 
> In that case maybe 16 for inner loop iterations and 512 for outer loop iterations. 

It seems that we have to use a smaller number if this implementation is
not wrong, because it would take 99.4 minutes, estimating from the case
where "iters = 32" took 23.3 seconds
(8192/32*23.3/60 = 99.4).


#define START_SIZE (16 * 1024)
...
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = INNER_LOOP_ITERS; // 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
    {
      CALL (impl, s, c1, n * 16);
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}

> > +      CALL (impl, s, c2, n);
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > +}
> > +
> > +static void
> > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> > +{
> > +  align &= 63;
> Can you make this `align &= getpagesize () - 1;`?  

I'll change it.

Thanks.
Naohiro
  
develop--- via Libc-alpha July 21, 2021, 1:07 p.m. UTC | #3
Hi Noah,

One typo in the updated code.
Wrong:
  #define START_SIZE (16 * 1024)
Right:
  #define BUF1PAGES 16

Thanks
Naohiro
  
Noah Goldstein July 21, 2021, 6:14 p.m. UTC | #4
On Wed, Jul 21, 2021 at 9:07 AM naohirot@fujitsu.com <naohirot@fujitsu.com>
wrote:

> Hi Noah,
>
> One typo in the updated code.
> Wrong:
>   #define START_SIZE (16 * 1024)
> Right:
>   #define BUF1PAGES 16
>
> Thanks
> Naohiro
> ________________________________________
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, 21 July 2021 21:56
> To: Noah Goldstein
> Cc: Wilco Dijkstra; Lucas A. M. Magalhaes; GNU C Library
> Subject: RE: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
>
> Hi Noah,
>
> Thank you for the review.
>
> > > +#define TEST_MAIN
> > > +#define TEST_NAME "memset"
> > > +#define START_SIZE (16 * 1024)
> > > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > > +#define TIMEOUT (20 * 60)
> > > +#include "bench-string.h"
> > > +
> > > +#include "json-lib.h"
> > > +
> > > +void *generic_memset (void *, int, size_t);
> > > +typedef void *(*proto_t) (void *, int, size_t);
> > > +
> > > +IMPL (MEMSET, 1)
> > > +IMPL (generic_memset, 0)
> > > +
> > > +static void
> > Do we want __attribute__((noinline, noclone))?
>
> Yes, I'll add it.
>
> > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > > +            int c1 __attribute ((unused)), int c2 __attribute
> ((unused)),
> > > +            size_t n)
> > > +{
> > > +  size_t i, iters = 16;
> >
> > I think 16 is probably too few iterations for reliable benchmarking.
> > Maybe `INNER_LOOP_ITERS` which is 8192
>
> I tried it. If it is changed to 8192, it hit the TIMEOUT (20 * 60) on
> a64fx.
> Please check the code below.
>
> >
> > > +  timing_t start, stop, cur;
> > > +
> > > +  TIMING_NOW (start);
> > > +  for (i = 0; i < iters; i += 2)
> > > +    {
> > > +      CALL (impl, s, c1, n);
> > I am a bit worried that the overhead from the first call with `c1` will
> distort the results.
> > Is it possible to implement it with a nested loop where you fill `s`
> with `c1` for
> > `n * inner_loop_iterations` in the outer loop and in the inner loop fill
> `c2` on `s + n * i`?
> > In that case maybe 16 for inner loop iterations and 512 for outer loop
> iterations.
>
> It seems that we have to set smaller number if this implementation is not
> wrong.
> Because it will take 99.4 minutes estimating from the case that "iters =
> 32"
> took 23.3 seconds.
> (8192/32*23.3/60=99.4)

I see. I think 16 for the inner loop makes sense. From the x86_64
perspective this will keep the loop from running out of the LSD, which
is necessary for accurate benchmarking. I guess then somewhere between
[2, 8] is reasonable for the outer loop?


> #define START_SIZE (16 * 1024)
> ...
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
>   timing_t start, stop, cur, latency = 0;
>
>   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
>     {
>       CALL (impl, s, c1, n * 16);
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
>
This looks good. But as you said, a much smaller value for the outer loop.

>
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
>
> > > +      CALL (impl, s, c2, n);
> > > +    }
> > > +  TIMING_NOW (stop);
> > > +
> > > +  TIMING_DIFF (cur, start, stop);
> > > +
> > > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > > +}
> > > +
> > > +static void
> > > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t
> len)
> > > +{
> > > +  align &= 63;
> > Can you make this `align &= getpagesize () - 1;`?
>
> I'll change it.
>
> Thanks.
> Naohiro
>
  
Wilco Dijkstra July 21, 2021, 7:17 p.m. UTC | #5
Hi,

      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
 
This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you
might as well change the outer loop to use a larger 'n'. The accuracy
will be bad unless 'n' is really large, and there is no way to improve it.

If you want to test zero/non-zero combinations accurately, store the
memset values in an array rather than using a single constant.

Cheers,
Wilco
  
develop--- via Libc-alpha July 26, 2021, 8:39 a.m. UTC | #6
Hi Noah,

> I see. I think 16 for the inner loop makes sense. From the x86_64
> perspective this
> will keep the loop from running out of the LSD which is necessary for
> accurate
> benchmarking. I guess then somewhere between [2, 8] is reasonable for the
> outer
> loop?
> 
> 
> > #define START_SIZE (16 * 1024)
> > ...
> > static void
> > __attribute__((noinline, noclone))
> > do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> >              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> >              size_t n)
> > {
> >   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
> >   timing_t start, stop, cur, latency = 0;
> >
> >   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
> >     {
> >
> >       CALL (impl, s, c1, n * 16);
> >       TIMING_NOW (start);
> >       for (j = 0; j < 16; j++)
> >         CALL (impl, s + n * j, c2, n);
> >       TIMING_NOW (stop);
> >       TIMING_DIFF (cur, start, stop);
> >       TIMING_ACCUM (latency, cur);
> >     }
> >
> This looks good. But as you said, a much smaller value for outer loop.

I made one improvement: I replaced
  CALL (impl, s, c1, n * 16);
with
  __builtin_memset (s, c1, n * 16);
and tentatively chose two outer loop iterations, as follows:

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 2; i++)
    {
      __builtin_memset (s, c1, n * 16);
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

In the case of __memset_generic on a64fx, running the outer loop 8 times
and 2 times took the following:

8 times
real    0m26.236s
user    0m18.806s
sys     0m6.562s

2 times
real    0m12.956s
user    0m5.081s
sys     0m6.594s

The performance difference is shown in a comparison graph [1];
there is a difference at 16KB.
This difference would not be critical if we use the performance data
mainly to compare "before" with "after", such as the master version of
memset against a patched version of memset.

This graph [1] can be drawn as follows:

$ cat 2times/bench-memset-zerofill.out 8times/bench-memset-zerofill.out | \
> merge_strings4graph.sh __memset_generic 2times 8times | \
> plot_strings.py -l -p thru -v -


In order to use __builtin_memset() and create the comparison graph [1],
I submitted two groundwork patches [2][3].

[1] https://drive.google.com/file/d/1vD1VE3pdHLoYdaAMWXtImvDlGFDHYkyx/view?usp=sharing
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/129459.html
[3] https://sourceware.org/pipermail/libc-alpha/2021-July/129460.html

Thanks.
Naohiro
  
develop--- via Libc-alpha July 26, 2021, 8:42 a.m. UTC | #7
Hi Wilco,

> 
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>  
> This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you

Yes, but the number of function calls is different between 1 time and
16 times.

> might as well change the outer loop to use a larger 'n'. The accuracy
> will be bad unless 'n' is really large, and there is no way to improve it.

Umm, I couldn't understand the logic of this part.
How do we change the outer loop to use a larger 'n'?

> If you want to test zero/non-zero combinations accurately, store the
> memset values in an array rather than using a single constant.

I changed one line as shown in the mail [1]
from
  CALL (impl, s, c1, n * 16);
to
  __builtin_memset (s, c1, n * 16);

Is this the array you mentioned?

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129461.html

Thanks.
Naohiro
  
Wilco Dijkstra July 26, 2021, 11:15 a.m. UTC | #8
Hi Naohiro,

>> This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you
>
> Yes, but the number of function call is different between 1 time and
> 16 times.

The call overhead is not an issue unless 'n' is really small. The point is that
the loop writes 16 * n bytes, so you're really testing 16n rather than n.

>> might as well change the outer loop to use a larger 'n'. The accuracy
>> will be bad unless 'n' is really large, and there is no way to improve it.
>
> Umm I couldn't understand the logic of this part.
> How do we change the the outer loop to use a larger 'n'?

By removing the * 16 from the inner loop and adding it to the outer loop.
That avoids the confusion that we are testing size 'n' when we are really
testing n*16.

>  CALL (impl, s, c1, n * 16);
> to
>   __builtin_memset (s, c1, n * 16);
>
> Is this the array you mentioned?

That doesn't make any sense since that will just call memset and use the
default ifunc for memset.

What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
This way you can test any kind of pattern (all zero, all one, and combinations
with a varying number of zero->non-zero and non-zero->zero transitions).
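
For instance (a rough sketch; the contents of memset_array are just an
example, any mix of zero and non-zero values would do):

  static const int memset_array[16] =
    { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 };

  for (i = 0; i < iters; i++)
    CALL (impl, s, memset_array[i & 15], n);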

Cheers,
Wilco
  
Noah Goldstein July 26, 2021, 5:22 p.m. UTC | #9
On Mon, Jul 26, 2021 at 4:39 AM naohirot@fujitsu.com <naohirot@fujitsu.com>
wrote:

> Hi Noah,
>
> > I see. I think 16 for the inner loop makes sense. From the x86_64
> > perspective this
> > will keep the loop from running out of the LSD which is necessary for
> > accurate
> > benchmarking. I guess then somewhere between [2, 8] is reasonable for the
> > outer
> > loop?
> >
> >
> > > #define START_SIZE (16 * 1024)
> > > ...
> > > static void
> > > __attribute__((noinline, noclone))
> > > do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > >              int c1 __attribute ((unused)), int c2 __attribute
> ((unused)),
> > >              size_t n)
> > > {
> > >   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
> > >   timing_t start, stop, cur, latency = 0;
> > >
> > >   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
> > >     {
> > >
> > >       CALL (impl, s, c1, n * 16);
> > >       TIMING_NOW (start);
> > >       for (j = 0; j < 16; j++)
> > >         CALL (impl, s + n * j, c2, n);
> > >       TIMING_NOW (stop);
> > >       TIMING_DIFF (cur, start, stop);
> > >       TIMING_ACCUM (latency, cur);
> > >     }
> > >
> > This looks good. But as you said, a much smaller value for outer loop.
>
> I made one improvement that replaced
>   CALL (impl, s, c1, n * 16);
> to
>   __builtin_memset (s, c1, n * 16);
> and tentatively chose outer loop two times such as the followings:
>
> -----
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, j, iters = 32;
>   timing_t start, stop, cur, latency = 0;
>
>   for (i = 0; i < 2; i++)
>     {
>       __builtin_memset (s, c1, n * 16);
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
>
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
>

Looks good!

> -----
>
> In case of __memset_generic on a64fx, execution of outer loop 8times
> and 2times took as follows:
>
> 8times
> real    0m26.236s
> user    0m18.806s
> sys     0m6.562s
>
> 2times
> real    0m12.956s
> user    0m5.081s
> sys     0m6.594s
>
> The performance difference is shown in a comparison graph [1],
> there is a difference at 16KB.
> This difference would not be critical if we use the performance data
> mainly to compare "before" with "after" such as master version of
> memset with patched version of memset.
>
>
> This graph[1] can be drawn as the following:
>
> $ cat 2times/bench-memset-zerofill.out 8times/bench-memset-zerofill.out | \
> > merge_strings4graph.sh __memset_generic 2times 8times | \
> > plot_strings.py -l -p thru -v -
>
>
> In order to use __builtin_memset() and create the comparison graph [1],
> I submitted two ground work patches [2][3].
>
> [1]
> https://drive.google.com/file/d/1vD1VE3pdHLoYdaAMWXtImvDlGFDHYkyx/view?usp=sharing
> [2] https://sourceware.org/pipermail/libc-alpha/2021-July/129459.html
> [3] https://sourceware.org/pipermail/libc-alpha/2021-July/129460.html
>
> Thanks.
> Naohiro
>
  
develop--- via Libc-alpha July 27, 2021, 2:24 a.m. UTC | #10
Hi Wilco,

Thank you for the explanation.

> >> might as well change the outer loop to use a larger 'n'. The accuracy
> >> will be bad unless 'n' is really large, and there is no way to improve it.
> >
> > Umm I couldn't understand the logic of this part.
> > How do we change the the outer loop to use a larger 'n'?
> 
> By removing the * 16 from the inner loop and adding it to the outer loop.
> That avoids the confusion that we are testing size 'n' when we are really
> testing n*16.

There may be miscommunication.
The * 16 is already in the outer loop (1).
Let me copy the code from the mail [1] I put in the previous mail [2].

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 2; i++)
    {
      __builtin_memset (s, c1, n * 16);  // (1)
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129461.html
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/129462.html

BTW, are you thinking of code like this?
But that must not be it, because there are no inner and outer loops.

  for (i = 0; i < 16; i++)
    {
      __builtin_memset (s, c1, n);

      TIMING_NOW (start);
      CALL (impl, s, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

> 
> >  CALL (impl, s, c1, n * 16);
> > to
> >   __builtin_memset (s, c1, n * 16);
> >
> > Is this the array you mentioned?
> 
> That doesn't make any sense since that will just call memset and use the
> default ifunc for memset.

This is my intention, because "CALL (impl, s, c1, n * 16);" is not
measured; it is outside of "TIMING_NOW (start);" and "TIMING_NOW (stop);".
It doesn't matter what kind of memset is called; what matters is that the
function name in the code makes it clear that this call is not measured.

> 
> What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
> This way you can test any kind of pattern (like all zero, all one, and combinations
> with varying number of zero->non-zero and non-zero->zero transitions).

I understood, thanks.
Why don't we separate it into another patch if it really matters?

From the AArch64 point of view, the purpose of this benchmark is to measure
"DC ZVA" performance, so the non-zero value can be any value except zero.
Do we have any specific reason to vary the non-zero value?

Thanks.
Naohiro
  
Wilco Dijkstra July 27, 2021, 5:26 p.m. UTC | #11
Hi Naohiro,

> There may be miscomminuation.
> The * 16 is already in the outer loop (1).

The outer loop is in test_main, and it determines 'n' in do_one_test:

  for (i = ...)
    {
      do_test (&json_ctx, 0, c, i);
    }

> Let me copy the code from the mail [1] I put in the previouse mail [2].

The key issue is that this loop:

      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);

is equivalent to:

CALL (impl, s, c2, n * 16);

The loop we really want is something like bench-memset-large:

  CALL (impl, s, c, n);
  TIMING_NOW (start);
  for (i = 0; i < iters; ++i)
    {
      CALL (impl, s, c, n);
    }
  TIMING_NOW (stop);

This repeats CALL on data of size 'n' after an initial warmup of the caches.

> It doesn't matter what kind of memset is called, but matters the
> function name in the code so that we can understand it is not mesured.

Then using the standard name 'memset' would be best.

>> What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
>> This way you can test any kind of pattern (like all zero, all one, and combinations
>> with varying number of zero->non-zero and non-zero->zero transitions).
>
> I understood, thanks.
> Why don't we separate it to another patch if it is really matter?

I don't think it matters, however I thought that is what your loops try to
measure? If not, then why not use the loop from bench-memset-large?

> From AArch64 point of view, the purpose of this bench is to measure
> "DC ZVA" performance. So non-zero value can be any value except zero.
> Do we have any specific reason to vary the non-zero value?

Well if that is the goal then bench-memset-large can measure zero performance
with minor changes. If you don't need to do anything completely different then
the existing code is good enough.

Cheers,
Wilco
  
develop--- via Libc-alpha July 28, 2021, 7:27 a.m. UTC | #12
Hi Wilco, Noah,

> > There may be miscomminuation.
> > The * 16 is already in the outer loop (1).
> 
> The outer loop is in test_main, and it determines 'n' in do_one_test:
> 
>   for (i = ...)
>     {
>       do_test (&json_ctx, 0, c, i);
>     }
> 
> > Let me copy the code from the mail [1] I put in the previouse mail [2].
> 
> The key issue is that this loop:
> 
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
> 
> is equivalent to:
> 
> CALL (impl, s, c2, n * 16);
> 
> The loop we really want is something like bench-memset-large:
> 
>   CALL (impl, s, c, n);
>   TIMING_NOW (start);
>   for (i = 0; i < iters; ++i)
>     {
>       CALL (impl, s, c, n);
>     }
>   TIMING_NOW (stop);
> 
> This repeats CALL on data of size 'n' after an initial warmup of the caches.
> 
> > It doesn't matter what kind of memset is called, but matters the
> > function name in the code so that we can understand it is not mesured.
> 
> Then using the standard name 'memset' would be best.
> 

OK, I understood, thanks.

Taking Noah's comment [1] into account, the final code should be like
the one below. Can we agree on this code?

The two results, the two-loop version in the mail [1] and the one-loop
version below, are almost the same in the case of __memset_generic on
a64fx, as shown in the graph [2].

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, iters = 32;
  timing_t start, stop, cur, latency = 0;

  CALL (impl, s, c2, n); // warm up

  for (i = 0; i < iters; i++)
    {
      memset (s, c1, n); // alternation

      TIMING_NOW (start);

      CALL (impl, s, c2, n);

      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129486.html
[2] https://drive.google.com/file/d/1bptHqg5vvFAGoYgoR3w_pvclXFSP8Sr0/view?usp=sharing

Thanks.
Naohiro
  
develop--- via Libc-alpha Aug. 4, 2021, 9:11 a.m. UTC | #13
Hi Wilco, Noah,

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 4:28 PM
> 
> Taking Noah's comment [1] into account, the final code should be like
> the below. Can we agree with this code?
> 
> Two results, two loop version in the mail [1] and one loop version
> below, are almost same in case of __memset_generic on a64fx as
> shown in the graph [2].
> 
> -----
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, iters = 32;
>   timing_t start, stop, cur, latency = 0;
> 
>   CALL (impl, s, c2, n); // warm up
> 
>   for (i = 0; i < iters; i++)
>     {
>       memset (s, c1, n); // alternation
> 
>       TIMING_NOW (start);
> 
>       CALL (impl, s, c2, n);
> 
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
> 
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
> -----

I'd like to share an interesting insight which was found when
START_SIZE was changed from 16KB to the smaller size 256B.
Currently DC ZVA is used in __memset_generic (sysdeps/aarch64/memset.S)
if the size is more than 256B and the value is zero.
However, DC ZVA is slower than store instructions if the size is less
than 16KB on A64FX [3].
This would indicate that the appropriate DC ZVA start size might be
different on each CPU.
It would be interesting to see how other CPUs behave.
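
To illustrate (a greatly simplified C-level sketch of that dispatch, not
the actual assembly; zva_fill and store_fill are hypothetical helpers):

void *
simplified_memset (void *s, int c, size_t n)
{
  if (c == 0 && n > 256)         /* 256 is the start size in question */
    return zva_fill (s, n);      /* zero fill, one DC ZVA per ZVA block */
  return store_fill (s, c, n);   /* ordinary store instructions */
}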

The code is below; it measures 4 patterns, zero-over-zero,
zero-over-one, one-over-zero and one-over-one, from 256B to 64MB.
In the graph [3], the 4 patterns are abbreviated as 0o0, 0o1, 1o0 and 1o1.


#define START_SIZE 256
#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)

  for (c1 = 0; c1 < 2; c1++)
    for (c2 = 0; c2 < 2; c2++)
      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
        {
          do_test (&json_ctx, 0, c1, c2, i);
          do_test (&json_ctx, 3, c1, c2, i);
        }

I'd like to submit a v3 patch incorporating the above change as well.

[3] https://drive.google.com/file/d/1fonjDDlF4LPLfZY9-z22DGn-yaSpGN4g/view?usp=sharing

Thanks.
Naohiro

> [1] https://sourceware.org/pipermail/libc-alpha/2021-July/129486.html
> [2] https://drive.google.com/file/d/1bptHqg5vvFAGoYgoR3w_pvclXFSP8Sr0/view?usp=sharing
> 
> Thanks.
> Naohiro
  

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..21b95c736190 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,7 @@  string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..2579b6edd09e
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,128 @@ 
+/* Measure memset functions with zero fill data.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE (16 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 16;
+  timing_t start, stop, cur;
+
+  TIMING_NOW (start);
+  for (i = 0; i < iters; i += 2)
+    {
+      CALL (impl, s, c1, n);
+      CALL (impl, s, c2, n);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
+{
+  align &= 63;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char1", c1);
+  json_attr_int (json_ctx, "char2", c2);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c1, c2;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  c2 = 0;
+  for (c1 = 0; c1 < 2; c1++)
+    for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+      {
+	do_test (&json_ctx, 0, c1, c2, i);
+	do_test (&json_ctx, 3, c1, c2, i);
+      }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>