Add random memcpy test

Message ID DB3PR08MB0089596570EFB077FB73B79783590@DB3PR08MB0089.eurprd08.prod.outlook.com
State Superseded
Headers

Commit Message

Wilco Dijkstra June 3, 2016, 12:34 p.m. UTC
  ping
  

Comments

Siddhesh Poyarekar June 3, 2016, 1:09 p.m. UTC | #1
On Fri, Jun 03, 2016 at 12:34:55PM +0000, Wilco Dijkstra wrote:
> This patch adds a new memcpy test that uses small copy sizes using random alignment
> and size. The copy size is based on a preset distribution that favors smaller sizes and multiples
> of 4, 8 and 16.  Instead of repeating the same copy over and over again like the existing
> tests, it times several thousand different copies to more accurately estimate the overhead
> of branch prediction.

We need to have a deeper discussion on what the string benchmark tests
are meant to achieve.  We have agreed in the past that the current set
of tests is not useful enough and that we need to rework them.

To begin with, please add more comments in the test on why you have
chosen the distribution you have and also some comments on the effects
you think the distribution may have on other key factors such as cache
locality.  If you have come across this distribution in some workload,
it might give more relevance to this test since then we know that
improving on this benchmark gives a decent likelihood of improving
performance on a known and perhaps useful workload.

Siddhesh

PS: Whatever happened to the whole system benchmarking project?

> 
> OK for commit?
> 
> ChangeLog:
> 2016-05-11  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>         * benchtests/Makefile (string-benchset): Add memcpy-random.
>         * benchtests/bench-memcpy-random.c: New file.
> 
> ---
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 61077ea9b6f7d4c342192429a8d90ecdf9bdaea7..03311dd72856bf0e595a759b817cb772f0fd3a6f 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -38,7 +38,7 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
>                    strcat strchr strchrnul strcmp strcpy strcspn strlen \
>                    strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> -                  strcoll memcpy-large memmove-large memset-large
> +                  strcoll memcpy-large memcpy-random memmove-large memset-large
>  wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
>                    wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
>                    wmemchr wmemset wmemcmp
> diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..668d6a1d35074f4227be4e1ff424da556a377cef
> --- /dev/null
> +++ b/benchtests/bench-memcpy-random.c
> @@ -0,0 +1,130 @@
> +/* Measure memcpy functions.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MIN_PAGE_SIZE 131072
> +#define TEST_MAIN
> +#define TEST_NAME "memcpy-random"
> +#include "bench-string.h"
> +
> +IMPL (memcpy, 0)
> +
> +#define NUM_COPIES 2048
> +#define NUM_DISTR  1024
> +
> +typedef struct
> +{
> +  uint16_t src;
> +  uint16_t dst;
> +  uint16_t len;
> +} copy_t;
> +
> +static copy_t copy[NUM_COPIES];
> +static uint8_t copy_distribution[NUM_DISTR];
> +
> +typedef char *(*proto_t) (char *, const char *, size_t);
> +
> +
> +static void
> +init_copy_distribution (void)
> +{
> +  int i, n, pos = 0;
> +  for (i = 0; i < 256; i++)
> +    {
> +      if (i < 8)
> +       n = 1;
> +      else if (i < 16)
> +       n = 8;
> +      else if (i < 32)
> +       n = 6;
> +      else if (i < 64)
> +       n = 4;
> +      else if (i < 128)
> +       n = 2;
> +      else
> +       n = 1;
> +
> +      if ((i & 15) == 0)
> +       n = n * 7;
> +      else if ((i & 7) == 0)
> +       n = n * 5;
> +      else if ((i & 3) == 0)
> +       n = n * 3;
> +
> +      for ( ; n > 0 && pos < NUM_DISTR; n--)
> +       copy_distribution[pos++] = i;
> +    }
> +  for ( ; pos < NUM_DISTR; pos++)
> +    copy_distribution[pos] = 255;
> +}
> +
> +static void
> +do_one_test (impl_t *impl, char *dst, char *src, copy_t *copy, size_t n)
> +{
> +  timing_t start, stop, cur;
> +  size_t iters = INNER_LOOP_ITERS * 20;
> +
> +  TIMING_NOW (start);
> +  for (int i = 0; i < iters; ++i)
> +    for (int j = 0; j < n; j++)
> +      CALL (impl, dst + copy[j].dst, src + copy[j].src, copy[j].len);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +}
> +
> +static void
> +do_test (size_t max_size)
> +{
> +  for (int i = 0; i < max_size; i++)
> +    buf1[i] = i * 3;
> +
> +  for (int i = 0; i < NUM_COPIES; i++)
> +    {
> +      copy[i].dst = rand () & (max_size - 1);
> +      copy[i].src = rand () & (max_size - 1);
> +      copy[i].len = copy_distribution[rand () & (NUM_DISTR - 1)];
> +    }
> +
> +  printf ("Memory size %6zd:", max_size);
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    do_one_test (impl, (char *) buf2, (char *) buf1, copy, NUM_COPIES);
> +
> +  putchar ('\n');
> +}
> +
> +int
> +test_main (void)
> +{
> +  test_init ();
> +  init_copy_distribution ();
> +
> +  printf ("%23s", "");
> +  FOR_EACH_IMPL (impl, 0)
> +    printf ("\t%s", impl->name);
> +  putchar ('\n');
> +
> +  for (int i = 4; i <= 64; i = i * 2)
> +    do_test (i * 1024);
> +
> +  return ret;
> +}
> +
> +#include "../test-skeleton.c"
>
  
Carlos O'Donell June 3, 2016, 5:11 p.m. UTC | #2
On 06/03/2016 09:09 AM, Siddhesh Poyarekar wrote:
> PS: Whatever happened to the whole system benchmarking project?

It's growing slowly out of DJ's malloc benchmarking project? :-)

On dj/malloc we have the ability to:

- Trace malloc with minimal overhead during an application run.
- Save traces of stored data.
- Re-run trace in model simulator.
  - Allows you to play locally with malloc changes and see
    how they impact the saved workload.

The problem is that this scales very slowly API by API.

The whole system benchmarking project idea was that we would
take traces of the whole system and then feed that data back
as a workload that glibc can be tuned against.

We have scaled it back to 1 process from 1 system, and to
1 API from the whole library.
  
Siddhesh Poyarekar June 7, 2016, 12:31 p.m. UTC | #3
On Fri, Jun 03, 2016 at 01:11:59PM -0400, Carlos O'Donell wrote:
> - Trace malloc with minimal overhead during an application run.
> - Save traces of stored data.
> - Re-run trace in model simulator.
>   - Allows you to play locally with malloc changes and see
>     how they impact the saved workload.
> 
> The problem is that this scales very slowly API by API.
> 
> The whole system benchmarking project idea was that we would
> take traces of the whole system and then feed that data back
> as a workload that glibc can be tuned against.
> 
> We have scaled it back to 1 process from 1 system, and to
> 1 API from the whole library.

My understanding is that is what we agreed on last year at the
Cauldron.  If the workload you have is relevant (e.g. libvirtd with a
guest running and doing a live migration of a 10GB VM) then it would
be a great idea to have that included in the microbenchmark.  In
future then we could encourage similar workloads to build up in here
and then move forward from there.

Siddhesh
  
Carlos O'Donell June 8, 2016, 7:13 p.m. UTC | #4
On 06/07/2016 08:31 AM, Siddhesh Poyarekar wrote:
> On Fri, Jun 03, 2016 at 01:11:59PM -0400, Carlos O'Donell wrote:
>> - Trace malloc with minimal overhead during an application run.
>> - Save traces of stored data.
>> - Re-run trace in model simulator.
>>   - Allows you to play locally with malloc changes and see
>>     how they impact the saved workload.
>>
>> The problem is that this scales very slowly API by API.
>>
>> The whole system benchmarking project idea was that we would
>> take traces of the whole system and then feed that data back
>> as a workload that glibc can be tuned against.
>>
>> We have scaled it back to 1 process from 1 system, and to
>> 1 API from the whole library.
> 
> My understanding is that is what we agreed on last year at the
> Cauldron.  If the workload you have is relevant (e.g. libvirtd with a
> guest running and doing a live migration of a 10GB VM) then it would
> be a great idea to have that included in the microbenchmark.  In
> future then we could encourage similar workloads to build up in here
> and then move forward from there.

OK.

The only workloads we have right now are a 389-ds workload (ldap multi-client
single server) and some qemu virt workloads (fio). I'm trying to get my
hands on a FreeIPA workload, along with a gluster workload. I'm trying to
cover a few of the key pain points: disk, vm, idm etc.

We are working on getting more workloads, recording them, and then making
them available as stored data to replay in the workload simulator.
  

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 61077ea9b6f7d4c342192429a8d90ecdf9bdaea7..03311dd72856bf0e595a759b817cb772f0fd3a6f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -38,7 +38,7 @@  string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
                   strcat strchr strchrnul strcmp strcpy strcspn strlen \
                   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
                   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
-                  strcoll memcpy-large memmove-large memset-large
+                  strcoll memcpy-large memcpy-random memmove-large memset-large
 wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
                   wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
                   wmemchr wmemset wmemcmp
diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
new file mode 100644
index 0000000000000000000000000000000000000000..668d6a1d35074f4227be4e1ff424da556a377cef
--- /dev/null
+++ b/benchtests/bench-memcpy-random.c
@@ -0,0 +1,130 @@ 
+/* Measure memcpy functions.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MIN_PAGE_SIZE 131072
+#define TEST_MAIN
+#define TEST_NAME "memcpy-random"
+#include "bench-string.h"
+
+IMPL (memcpy, 0)
+/* Table sizes.  NUM_DISTR is used as a mask, so it must be a power of 2.  */
+#define NUM_COPIES 2048
+#define NUM_DISTR  1024
+
+typedef struct
+{
+  uint16_t src;  /* Offset added to the source buffer pointer.  */
+  uint16_t dst;  /* Offset added to the destination buffer pointer.  */
+  uint16_t len;  /* Number of bytes to copy.  */
+} copy_t;
+
+static copy_t copy[NUM_COPIES];  /* Copies performed on each timed pass.  */
+static uint8_t copy_distribution[NUM_DISTR];  /* Weighted copy lengths.  */
+/* NOTE(review): presumably the call signature CALL expects -- confirm.  */
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+
+static void
+init_copy_distribution (void)
+{
+  /* Fill copy_distribution[] with lengths 0..255, repeated by weight.  */
+  int slot = 0;
+  for (int len = 0; len < 256; len++)
+    {
+      /* Base weight by size class; lengths 8..15 get the largest.  */
+      int weight = 1;
+      if (len >= 8 && len < 16)
+        weight = 8;
+      else if (len >= 16 && len < 32)
+        weight = 6;
+      else if (len >= 32 && len < 64)
+        weight = 4;
+      else if (len >= 64 && len < 128)
+        weight = 2;
+      /* Only the coarsest matching alignment multiplier applies.  */
+      if (len % 16 == 0)
+        weight *= 7;
+      else if (len % 8 == 0)
+        weight *= 5;
+      else if (len % 4 == 0)
+        weight *= 3;
+      /* Emit WEIGHT copies of LEN, never running past the table end.  */
+      while (weight-- > 0 && slot < NUM_DISTR)
+        copy_distribution[slot++] = len;
+    }
+  /* Pad any unused tail of the table with the largest length.  */
+  while (slot < NUM_DISTR)
+    copy_distribution[slot++] = 255;
+}
+
+static void
+do_one_test (impl_t *impl, char *dst, char *src, copy_t *copy, size_t n)
+{
+  /* Time ITERS passes over the N copies in COPY.  The elapsed time and
+     ITERS go to TIMING_PRINT_MEAN, so the mean is presumably per pass.  */
+  timing_t start, stop, cur;
+  size_t iters = INNER_LOOP_ITERS * 20;
+
+  TIMING_NOW (start);
+  for (size_t i = 0; i < iters; ++i)
+    for (size_t j = 0; j < n; j++)
+      CALL (impl, dst + copy[j].dst, src + copy[j].src, copy[j].len);
+  TIMING_NOW (stop);
+  TIMING_DIFF (cur, start, stop);
+  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t max_size)
+{
+  for (size_t i = 0; i < max_size; i++)
+    buf1[i] = i * 3;
+  /* Random offsets below MAX_SIZE (mask needs a power of 2) and lengths.  */
+  for (int i = 0; i < NUM_COPIES; i++)
+    {
+      copy[i].dst = rand () & (max_size - 1);
+      copy[i].src = rand () & (max_size - 1);
+      copy[i].len = copy_distribution[rand () & (NUM_DISTR - 1)];
+    }
+  /* max_size is a size_t, so it is printed with %zu rather than %zd.  */
+  printf ("Memory size %6zu:", max_size);
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_test (impl, (char *) buf2, (char *) buf1, copy, NUM_COPIES);
+
+  putchar ('\n');
+}
+
+int
+test_main (void)
+{
+  /* Harness setup first, then build the copy-length table.  */
+  test_init ();
+  init_copy_distribution ();
+  /* Header row: blank label column, then one name per implementation.  */
+  printf ("%23s", "");
+  FOR_EACH_IMPL (impl, 0)
+    printf ("\t%s", impl->name);
+  putchar ('\n');
+  /* Run with sizes of 4 KiB up to 64 KiB, doubling each step.  */
+  for (size_t kib = 4; kib <= 64; kib *= 2)
+    do_test (kib * 1024);
+  return ret;
+}
+
+#include "../test-skeleton.c"