[v3,1/2] x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
This is just a refactor and there should be no behavioral change from
this commit.
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
---
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
2 files changed, 23 insertions(+), 8 deletions(-)
Comments
On Tue, Aug 13, 2024 at 11:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is just a refactor and there should be no behavioral change from
> this commit.
>
> The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
> for controlling whether we use non-temporal memset rather than having
> extra logic based on vendor.
> ---
> sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
> sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
> 2 files changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 18ed008040..a4786d23c7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
> unsigned int stepping = 0;
> enum cpu_features_kind kind;
>
> + /* Default is to avoid non-temporal memset on non-Intel/AMD hardware. This
> + is because, as of writing this, we only have benchmarks indicating its
> + profitability on Intel/AMD. */
> + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> + |= bit_arch_Avoid_Non_Temporal_Memset;
> +
> cpu_features->cachesize_non_temporal_divisor = 4;
> #if !HAS_CPUID
> if (__get_cpuid_max (0, 0) == 0)
> @@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>
> update_active (cpu_features);
>
> + /* Benchmarks indicate non-temporal memset can be profitable on Intel
> + hardware. */
> + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> + &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
> if (family == 0x06)
> {
> model += extended_model;
> @@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
>
> ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
>
> + /* Benchmarks indicate non-temporal memset can be profitable on AMD
> + hardware. */
> + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> + &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> {
> /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index a1c03b8903..3d0c8d43b8 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> rep_movsb_threshold = 2112;
>
> - /* Non-temporal stores are more performant on Intel and AMD hardware above
> - non_temporal_threshold. Enable this for both Intel and AMD hardware. */
> - unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> - && (cpu_features->basic.kind == arch_kind_intel
> - || cpu_features->basic.kind == arch_kind_amd))
> - memset_non_temporal_threshold = non_temporal_threshold;
> -
> /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> cases slower than the vectorized path (and for some alignments,
> it is really slow, check BZ #30994). */
> @@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (tunable_size != 0)
> shared = tunable_size;
>
> + /* Non-temporal stores are more performant on some hardware above
> + non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared
> + for both Intel and AMD hardware. */
> + unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
> + memset_non_temporal_threshold = non_temporal_threshold;
> +
> tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
> if (tunable_size > minimum_non_temporal_threshold
> && tunable_size <= maximum_non_temporal_threshold)
> --
> 2.34.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
@@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ /* Default is to avoid non-temporal memset on non-Intel/AMD hardware. This
+ is because, as of writing this, we only have benchmarks indicating its
+ profitability on Intel/AMD. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (family == 0x06)
{
model += extended_model;
@@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores are more performant on Intel and AMD hardware above
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
- && (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd))
- memset_non_temporal_threshold = non_temporal_threshold;
-
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared
+ for both Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)