[v5,3/3] x86: Make the divisor in setting `non_temporal_threshold` cpu specific

Message ID 20230509031313.3497001-3-goldstein.w.n@gmail.com
State Superseded
Series [v5,1/3] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4`

Checks

Context                Check    Description
dj/TryBot-apply_patch  success  Patch applied to master at the time it was sent
dj/TryBot-32bit        success  Build for i686

Commit Message

Noah Goldstein May 9, 2023, 3:13 a.m. UTC
Different systems prefer different divisors.

From benchmarks[1] so far, the following divisors have been found:
    ICX     : 2
    SKX     : 2
    BWD     : 8

For Intel, we are generalizing that BWD and older prefer 8 as a
divisor, and SKL and newer prefer 2. These numbers can be further
tuned as more benchmarks are run.

[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks
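
For illustration, here is a standalone sketch (not part of the patch) of how a
per-microarch divisor scales the threshold the same way dl_init_cacheinfo does.
The uarch enum, the divisor_for helper, and the 32 MiB shared-L3 figure are
assumptions made up for this example, not glibc internals:

  #include <stdio.h>

  enum uarch { BROADWELL, SKYLAKE, ICELAKE_X, UNKNOWN };

  /* Mirror the patch's per-microarch choice: 8 for BWD and older,
     2 for SKL and newer, 4 as the generic fallback.  */
  static unsigned long
  divisor_for (enum uarch u)
  {
    switch (u)
      {
      case BROADWELL:
        return 8;
      case SKYLAKE:
      case ICELAKE_X:
        return 2;
      default:
        return 4;
      }
  }

  int
  main (void)
  {
    /* Assume a 32 MiB shared L3 for the sake of the example.  */
    unsigned long shared = 32UL * 1024 * 1024;
    enum uarch parts[] = { BROADWELL, SKYLAKE, ICELAKE_X, UNKNOWN };
    for (unsigned int i = 0; i < sizeof parts / sizeof parts[0]; i++)
      printf ("divisor %lu -> non_temporal_threshold = %lu bytes\n",
              divisor_for (parts[i]), shared / divisor_for (parts[i]));
    return 0;
  }

With a 32 MiB L3 this yields a 4 MiB threshold for divisor 8 (BWD) and a
16 MiB threshold for divisor 2 (SKX/ICX), versus 8 MiB with the generic 1/4.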
---
 sysdeps/x86/cpu-features.c         | 11 ++++++----
 sysdeps/x86/dl-cacheinfo.h         | 32 ++++++++++++++++++------------
 sysdeps/x86/include/cpu-features.h |  3 +++
 3 files changed, 29 insertions(+), 17 deletions(-)
  

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index bec70c3c49..3c1a77906a 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -637,6 +637,7 @@  init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
     {
@@ -720,6 +721,8 @@  init_cpu_features (struct cpu_features *cpu_features)
 		break;
 	    case INTEL_BIGCORE_NEHALEM:
 	    case INTEL_BIGCORE_WESTMERE:
+	      /* Older CPUs prefer non-temporal stores at a lower threshold.  */
+	      cpu_features->cachesize_non_temporal_divisor = 8;
 	      /* Rep string instructions, unaligned load, unaligned copy,
 		 and pminub are fast on Intel Core i3, i5 and i7.  */
 	      cpu_features->preferred[index_arch_Fast_Rep_String]
@@ -728,11 +731,12 @@  init_cpu_features (struct cpu_features *cpu_features)
 		      | bit_arch_Prefer_PMINUB_for_stringop);
 	      break;
 
-	      /* Untuned Bigcore microarch.  */
 	    case INTEL_BIGCORE_SANDYBRIDGE:
 	    case INTEL_BIGCORE_IVYBRIDGE:
 	    case INTEL_BIGCORE_HASWELL:
 	    case INTEL_BIGCORE_BROADWELL:
+	      cpu_features->cachesize_non_temporal_divisor = 8;
+	      break;
 	    case INTEL_BIGCORE_SKYLAKE:
 	    case INTEL_BIGCORE_AMBERLAKE:
 	    case INTEL_BIGCORE_COFFEELAKE:
@@ -753,11 +757,10 @@  init_cpu_features (struct cpu_features *cpu_features)
 	    case INTEL_BIGCORE_SAPPHIRERAPIDS:
 	    case INTEL_BIGCORE_EMERALDRAPIDS:
 	    case INTEL_BIGCORE_GRANITERAPIDS:
-	      break;
-
-	    /* Untuned Mixed (bigcore + atom SOC).  */
+	    /* Mixed (bigcore + atom SOC).  */
 	    case INTEL_MIXED_LAKEFIELD:
 	    case INTEL_MIXED_ALDERLAKE:
+	      cpu_features->cachesize_non_temporal_divisor = 2;
 	      break;
 	    }
 
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index c7e41029fa..6225c852f6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -738,19 +738,25 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->level3_cache_linesize = level3_cache_linesize;
   cpu_features->level4_cache_size = level4_cache_size;
 
-  /* The default setting for the non_temporal threshold is 1/4 of size
-     of the chip's cache. For most Intel and AMD processors with an
-     initial release date between 2017 and 2023, a thread's typical
-     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
-     estimate the point where non-temporal stores begin outcompeting
-     REP MOVSB. As well the point where the fact that non-temporal
-     stores are forced back to main memory would already occurred to the
-     majority of the lines in the copy. Note, concerns about the
-     entire L3 cache being evicted by the copy are mostly alleviated
-     by the fact that modern HW detects streaming patterns and
-     provides proper LRU hints so that the maximum thrashing
-     capped at 1/associativity. */
-  unsigned long int non_temporal_threshold = shared / 4;
+  unsigned long int cachesize_non_temporal_divisor
+      = cpu_features->cachesize_non_temporal_divisor;
+  if (cachesize_non_temporal_divisor <= 0)
+    cachesize_non_temporal_divisor = 4;
+
+  /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
+     size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
+     which is microarch specific; the default is 1/4). For most Intel and AMD
+     processors with an initial release date between 2017 and 2023, a thread's
+     typical share of the cache is from 18-64MB. Using a reasonable fraction
+     of L3 is meant to estimate the point where non-temporal stores begin
+     outcompeting REP MOVSB, as well as the point where the forced write-back
+     to main memory of non-temporal stores would already have occurred for
+     the majority of the lines in the copy. Note, concerns about the entire
+     L3 cache being evicted by the copy are mostly alleviated by the fact
+     that modern HW detects streaming patterns and provides proper LRU hints
+     so that the maximum thrashing is capped at 1/associativity. */
+  unsigned long int non_temporal_threshold
+      = shared / cachesize_non_temporal_divisor;
   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
      a higher risk of actually thrashing the cache as they don't have a HW LRU
     hint. As well, their performance in highly parallel situations is
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 40b8129d6a..f5b9dd54fe 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -915,6 +915,9 @@  struct cpu_features
   unsigned long int shared_cache_size;
   /* Threshold to use non temporal store.  */
   unsigned long int non_temporal_threshold;
+  /* When no user non_temporal_threshold is specified, we default to
+     cachesize / cachesize_non_temporal_divisor.  */
+  unsigned long int cachesize_non_temporal_divisor;
   /* Threshold to use "rep movsb".  */
   unsigned long int rep_movsb_threshold;
   /* Threshold to stop using "rep movsb".  */