From patchwork Thu Oct  5 16:55:34 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Siddhesh Poyarekar <siddhesh@sourceware.org>
X-Patchwork-Id: 23355
Received: (qmail 42694 invoked by alias); 5 Oct 2017 16:55:50 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 42677 invoked by uid 89); 5 Oct 2017 16:55:50 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-25.6 required=5.0 tests=BAYES_00, GIT_PATCH_0,
	GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE,
	RCVD_IN_SORBS_SPAM,
	SPF_NEUTRAL autolearn=ham version=3.3.2 spammy=
X-HELO: homiemail-a51.g.dreamhost.com
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
To: libc-alpha@sourceware.org
Subject: [COMMITTED 1/3] benchtests: Memory walking benchmark for memcpy
Date: Thu,  5 Oct 2017 22:25:34 +0530
Message-Id: <1507222536-3809-1-git-send-email-siddhesh@sourceware.org>

This benchmark is an attempt to eliminate cache effects from string
benchmarks.  The benchmark walks both ways through a large memory area
and copies blocks of different sizes and alignments one at a time
instead of looping around in the same memory area.  This is a good
metric to have alongside the other memcpy benchmarks, especially for
larger sizes where the likelihood of the call being done only once is
pretty high.

	* benchtests/bench-memcpy-walk.c: New file.
	* benchtests/Makefile (string-benchset): Add it.
---
 ChangeLog                      |   5 ++
 benchtests/Makefile            |   3 +-
 benchtests/bench-memcpy-walk.c | 127 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memcpy-walk.c

diff --git a/ChangeLog b/ChangeLog
index 7ddff74..a86faeb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-10-05  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* benchtests/bench-memcpy-walk.c: New file.
+	* benchtests/Makefile (string-benchset): Add it.
+
 2017-10-05  Florian Weimer  <fweimer@redhat.com>
 
 	nscd: Eliminate compilation time dependency in the build output.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 3acc39c..d086cc6 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -37,7 +37,8 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
 		   strcat strchr strchrnul strcmp strcpy strcspn strlen \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
-		   strcoll memcpy-large memcpy-random memmove-large memset-large
+		   strcoll memcpy-large memcpy-random memmove-large memset-large \
+		   memcpy-walk
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memcpy-walk.c b/benchtests/bench-memcpy-walk.c
new file mode 100644
index 0000000..69d467d
--- /dev/null
+++ b/benchtests/bench-memcpy-walk.c
@@ -0,0 +1,127 @@
+/* Measure memcpy function combined throughput for different alignments.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This microbenchmark measures memcpy performance for various sizes from 1
+   byte to 32MiB, doubling every iteration and then adding up to 16 bytes to
+   each size.  The copies are done from source to destination and then back,
+   and the source walks forward across the array and the destination walks
+   backward by one byte each, thus measuring misaligned accesses as well.  The
+   idea is to avoid caching effects by copying a different string each time,
+   with source and destination far enough from each other and walking in
+   different directions, so that we can measure prefetcher efficiency
+   (software or hardware) more closely than with a loop copying the same data
+   over and over, which eventually only gives us L1 cache performance.  */
+
+#ifndef MEMCPY_RESULT /* Defaults, used only if this is not yet defined.  */
+# define MEMCPY_RESULT(dst, len) dst
+# define START_SIZE 1			/* First length test_main tries.  */
+# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024) /* Page + 32MiB.  */
+# define TEST_MAIN
+# define TEST_NAME "memcpy"
+# define TIMEOUT (20 * 60)		/* Presumably seconds; TODO confirm.  */
+# include "bench-string.h"
+
+IMPL (memcpy, 1)
+#endif
+
+#include "json-lib.h"
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+/* Time IMPL copying LEN bytes from SRC to DST and back while SRC walks up and
+   the DST position walks down a byte at a time; emit the time per call.  */
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
+	     size_t len)
+{
+  size_t i, iters = MIN_PAGE_SIZE / len;
+  timing_t start, stop, cur;
+  char *dst_end = dst + MIN_PAGE_SIZE - len;
+  char *src_end = src + MIN_PAGE_SIZE - len;
+
+  TIMING_NOW (start);
+  /* Copy LEN bytes each way, then step both positions by one byte.  */
+  for (i = 0; i < iters && dst_end >= dst && src <= src_end; src++, dst_end--)
+    {
+      CALL (impl, dst_end, src, len);
+      CALL (impl, src, dst_end, len);
+      i += 2;
+    }
+  TIMING_NOW (stop);
+  TIMING_DIFF (cur, start, stop);
+
+  /* I counts calls (two per round trip), so CUR / I is the time per call.  */
+  json_element_double (json_ctx, (double) cur / i);
+}
+
+/* Emit one JSON result object for LEN with one timing per implementation.  */
+static void
+do_test (json_ctx_t *json_ctx, size_t len)
+{
+  json_element_object_begin (json_ctx);
+  /* LEN is already an unsigned integer; no floating-point cast is needed.  */
+  json_attr_uint (json_ctx, "length", len);
+  json_array_begin (json_ctx, "timings");
+  FOR_EACH_IMPL (impl, 0)
+    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+/* Benchmark entry point: emit a JSON document with a timing for every
+   implementation at each length from START_SIZE up, then return RET.  */
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+
+  test_init ();
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, "memcpy");
+  json_attr_string (&json_ctx, "bench-variant", "walk");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+  for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+    {
+      /* Test length offsets 0-16: pair J with 16 - J for J in 0-7, then add
+	 the midpoint 8, which the pairing alone never reaches.  */
+      for (int j = 0; j < 8; j++)
+	{
+	  do_test (&json_ctx, i + j);
+	  do_test (&json_ctx, i + 16 - j);
+	}
+      do_test (&json_ctx, i + 8);
+    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>