memcpy performance regressions 2.19 -> 2.24(5)

Message ID CAMe9rOrwd=4D1_0V2Uey06xexgWkcp2xw=RAC0wGicC9X8GJLA@mail.gmail.com
State New, archived
Headers

Commit Message

H.J. Lu May 22, 2017, 8:22 p.m. UTC
  On Mon, May 22, 2017 at 12:17 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, May 18, 2017 at 1:59 PM, Erich Elsen <eriche@google.com> wrote:
>> Hi H.J.,
>>
>> I was on vacation, sorry for the slow reply.  The updated benchmark
>> still shows the same behavior, thanks.
>>
>> I'll try my hand at creating a patch that makes that variable
>> __x86_shared_non_temporal_threshold a tunable.  It will be necessary
>> to do internal experiments anyway.
>>
>
> __x86_shared_non_temporal_threshold was set to 6 times the per-core
> shared cache size, based on the large memcpy micro benchmark in glibc
> on an 8-core processor.  For a processor with more than 8 cores, the
> threshold is too low.  Set __x86_shared_non_temporal_threshold to
> 3/4 of the total shared cache size so that it is unchanged on 8-core
> processors.  On processors with fewer than 8 cores, the threshold is
> lower.
>
> Any comments?
>

Here is a patch to add support for
"glibc.x86_cache.non_temporal_threshold=number"
to GLIBC_TUNABLES.
  

Patch

From 3e31bc4a930e7b32924befe762014f85d5408692 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 22 May 2017 12:00:43 -0700
Subject: [PATCH] Add x86_cache.non_temporal_threshold to GLIBC_TUNABLES

Add support for "glibc.x86_cache.non_temporal_threshold=number" to
GLIBC_TUNABLES.

	* elf/dl-tunables.list (x86_cache): New name space.
	* sysdeps/x86/cacheinfo.c [HAVE_TUNABLES] (TUNABLE_NAMESPACE):
	New.
	[HAVE_TUNABLES]: Include <elf/dl-tunables.h>.
	[HAVE_TUNABLES] (DL_TUNABLE_CALLBACK (set_non_temporal_threshold)):
	New.
	[HAVE_TUNABLES] (init_cacheinfo): Call TUNABLE_SET_VAL_WITH_CALLBACK
	with set_non_temporal_threshold.
---
 elf/dl-tunables.list    |  6 ++++++
 sysdeps/x86/cacheinfo.c | 22 +++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488..2c899fe 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -77,4 +77,10 @@  glibc {
       security_level: SXID_IGNORE
     }
   }
+  x86_cache {
+    non_temporal_threshold {
+      type: SIZE_T
+      security_level: SXID_IGNORE
+    }
+  }
 }
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 3434d97..1b195eb 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,20 @@ 
 #include <cpuid.h>
 #include <init-arch.h>
 
+/* Threshold to use non temporal store.  */
+long int __x86_shared_non_temporal_threshold attribute_hidden;
+
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+# include <elf/dl-tunables.h>
+
+void
+DL_TUNABLE_CALLBACK (set_non_temporal_threshold) (tunable_val_t *valp)
+{
+  __x86_shared_non_temporal_threshold = (long int) valp->numval;
+}
+#endif
+
 #define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
 #define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
 #define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -466,9 +480,6 @@  long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
 /* Similar to __x86_shared_cache_size, but not rounded.  */
 long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 
-/* Threshold to use non temporal store.  */
-long int __x86_shared_non_temporal_threshold attribute_hidden;
-
 #ifndef DISABLE_PREFETCHW
 /* PREFETCHW support flag for use in memory and string routines.  */
 int __x86_prefetchw attribute_hidden;
@@ -770,4 +781,9 @@  intel_bug_no_cache_info:
      total shared cache size.  */
   __x86_shared_non_temporal_threshold
     = __x86_shared_cache_size * threads * 3 / 4;
+
+#if HAVE_TUNABLES
+  TUNABLE_SET_VAL_WITH_CALLBACK (non_temporal_threshold, NULL,
+				 set_non_temporal_threshold);
+#endif
 }
-- 
2.9.4