From patchwork Thu Oct 5 16:55:34 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Siddhesh Poyarekar X-Patchwork-Id: 23355 Received: (qmail 42694 invoked by alias); 5 Oct 2017 16:55:50 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 42677 invoked by uid 89); 5 Oct 2017 16:55:50 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-25.6 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE, RCVD_IN_SORBS_SPAM, SPF_NEUTRAL autolearn=ham version=3.3.2 spammy= X-HELO: homiemail-a51.g.dreamhost.com From: Siddhesh Poyarekar To: libc-alpha@sourceware.org Subject: [COMMITTED 1/3] benchtests: Memory walking benchmark for memcpy Date: Thu, 5 Oct 2017 22:25:34 +0530 Message-Id: <1507222536-3809-1-git-send-email-siddhesh@sourceware.org> This benchmark is an attempt to eliminate cache effects from string benchmarks. The benchmark walks both ways through a large memory area and copies different sizes of memory and alignments one at a time instead of looping around in the same memory area. This is a good metric to have alongside the other memcpy benchmarks, especially for larger sizes where the likelihood of the call being done only once is pretty high. * benchtests/bench-memcpy-walk.c: New file. * benchtests/Makefile (string-benchset): Add it. 
--- ChangeLog | 5 ++ benchtests/Makefile | 3 +- benchtests/bench-memcpy-walk.c | 127 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 benchtests/bench-memcpy-walk.c diff --git a/ChangeLog b/ChangeLog index 7ddff74..a86faeb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2017-10-05 Siddhesh Poyarekar + + * benchtests/bench-memcpy-walk.c: New file. + * benchtests/Makefile (string-benchset): Add it. + 2017-10-05 Florian Weimer nscd: Eliminate compilation time dependency in the build output. diff --git a/benchtests/Makefile b/benchtests/Makefile index 3acc39c..d086cc6 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -37,7 +37,8 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \ strcat strchr strchrnul strcmp strcpy strcspn strlen \ strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \ strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \ - strcoll memcpy-large memcpy-random memmove-large memset-large + strcoll memcpy-large memcpy-random memmove-large memset-large \ + memcpy-walk # Build and run locale-dependent benchmarks only if we're building natively. ifeq (no,$(cross-compiling)) diff --git a/benchtests/bench-memcpy-walk.c b/benchtests/bench-memcpy-walk.c new file mode 100644 index 0000000..69d467d --- /dev/null +++ b/benchtests/bench-memcpy-walk.c @@ -0,0 +1,127 @@ +/* Measure memcpy function combined throughput for different alignments. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This microbenchmark measures the throughput of memcpy for various sizes from + 1 byte to 32MiB, doubling every iteration and then adding 0-7 and 9-16 + bytes to each size. The copies are done from source to destination and then back and the + source walks forward across the array and the destination walks backward by + one byte each, thus measuring misaligned accesses as well. The idea is to + avoid caching effects by copying a different string and far enough from each + other, walking in different directions so that we can measure prefetcher + efficiency (software or hardware) more closely than with a loop copying the + same data over and over, which eventually only gives us L1 cache + performance. */ +/* Default configuration, used only when MEMCPY_RESULT has not already been defined before this point: sizes start at START_SIZE (1) and MIN_PAGE_SIZE is one page plus 32 MiB. TIMEOUT is 20 * 60; the unit is not visible here (presumably seconds, confirm against the test driver). */ +#ifndef MEMCPY_RESULT +# define MEMCPY_RESULT(dst, len) dst +# define START_SIZE 1 +# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024) +# define TEST_MAIN +# define TEST_NAME "memcpy" +# define TIMEOUT (20 * 60) +# include "bench-string.h" + +IMPL (memcpy, 1) +#endif + +#include "json-lib.h" + +typedef char *(*proto_t) (char *, const char *, size_t); +/* Time IMPL copying LEN bytes between DST and SRC, treating both buffers as MIN_PAGE_SIZE bytes long, and emit one value with json_element_double. LEN must be non-zero because it is used as a divisor. TIMING_NOW, TIMING_DIFF and CALL are not defined in the visible code (presumably they come from bench-string.h). */ +static void +do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src, + size_t len) +{ + size_t i, iters = MIN_PAGE_SIZE / len; + timing_t start, stop, cur; + + char *dst_end = dst + MIN_PAGE_SIZE - len; + char *src_end = src + MIN_PAGE_SIZE - len; + + TIMING_NOW (start); + /* Each iteration copies LEN bytes from SRC to DST_END and then back again, after which SRC moves forward and DST_END moves backward by one byte. I counts calls (two per iteration); the loop stops once I reaches ITERS or either pointer leaves its buffer. 
*/ + for (i = 0; i < iters && dst_end >= dst && src <= src_end; src++, dst_end--) + { + CALL (impl, dst_end, src, len); + CALL (impl, src, dst_end, len); + i += 2; + } + TIMING_NOW (stop); + + TIMING_DIFF (cur, start, stop); + + /* Report CUR (the start-to-stop difference from TIMING_DIFF) multiplied by LEN and divided by the number of calls made (I). NOTE(review): this is not plain time per call; confirm the intended metric. */ + json_element_double (json_ctx, (double) cur * len / i); +} +/* Emit one JSON result object for LEN: a "length" attribute and a "timings" array holding one do_one_test value per implementation visited by FOR_EACH_IMPL, with buf2 as destination and buf1 as source. NOTE(review): LEN is cast to double before being passed to json_attr_uint; the prototype is not visible here, confirm the cast is wanted. */ +static void +do_test (json_ctx_t *json_ctx, size_t len) +{ + json_element_object_begin (json_ctx); + json_attr_uint (json_ctx, "length", (double) len); + json_array_begin (json_ctx, "timings"); + + FOR_EACH_IMPL (impl, 0) + do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len); + + json_array_end (json_ctx); + json_element_object_end (json_ctx); +} +/* Benchmark entry point: after test_init, write a JSON document to stdout with the timing type, a "memcpy" function object tagged with bench-variant "walk", the names of the implementations and one result object per tested length. Returns RET, which is not declared in the visible code (presumably provided by bench-string.h, confirm). */ +int +test_main (void) +{ + json_ctx_t json_ctx; + size_t i; + + test_init (); + + json_init (&json_ctx, 0, stdout); + + json_document_begin (&json_ctx); + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); + + json_attr_object_begin (&json_ctx, "functions"); + json_attr_object_begin (&json_ctx, "memcpy"); + json_attr_string (&json_ctx, "bench-variant", "walk"); + + json_array_begin (&json_ctx, "ifuncs"); + FOR_EACH_IMPL (impl, 0) + json_element_string (&json_ctx, impl->name); + json_array_end (&json_ctx); + + json_array_begin (&json_ctx, "results"); + for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1) + { + /* For each power-of-two size I, test lengths I + 0..7 and I + 16 down to I + 9. NOTE(review): I + 8 is never generated; confirm whether that gap is intended. */ + for (int j = 0; j < 8; j++) + { + do_test (&json_ctx, i + j); + do_test (&json_ctx, i + 16 - j); + } + } + + json_array_end (&json_ctx); + json_attr_object_end (&json_ctx); + json_attr_object_end (&json_ctx); + json_document_end (&json_ctx); + + return ret; +} + +#include