x86: Add new cpu-flag `Prefer_Non_Temporal`
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
Commit Message
The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
---
sysdeps/x86/cpu-tunables.c | 2 ++
sysdeps/x86/dl-cacheinfo.h | 28 +++++++++++++++++--
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 4 +--
sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
5 files changed, 40 insertions(+), 10 deletions(-)
Comments
On Wed, Jul 10, 2024 at 2:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
> sysdeps/x86/cpu-tunables.c | 2 ++
> sysdeps/x86/dl-cacheinfo.h | 28 +++++++++++++++++--
> ...cpu-features-preferred_feature_index_1.def | 1 +
> sysdeps/x86/tst-hwcap-tunables.c | 4 +--
> sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
> 5 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> Fast_Unaligned_Load, 19);
> CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> Fast_Unaligned_Copy, 19);
> + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> + Prefer_Non_Temporal, 19);
> }
> break;
> case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> slightly better than ERMS. */
> rep_stosb_threshold = SIZE_MAX;
>
> + /*
> + * For memset, the non-temporal implementation is only accessed through the
> + * stosb code. ie:
> + * ```
> + * if (size >= rep_stosb_thresh)
> + * {
> + * if (size >= non_temporal_thresh)
> + * {
> + * do_non_temporal ();
> + * }
> + * do_stosb ();
> + * }
> + * do_normal_vec_loop ();
> + * ```
> + * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> + * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> + *`rep stosb` will never be used.
> + */
> + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> + memset_non_temporal_threshold,
> + minimum_non_temporal_threshold, SIZE_MAX);
> + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> + rep_stosb_threshold
> + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
> TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> minimum_non_temporal_threshold,
> maximum_non_temporal_threshold);
> - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> - memset_non_temporal_threshold,
> - minimum_non_temporal_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> minimum_rep_movsb_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> BIT (MathVec_Prefer_No_AVX512)
> BIT (Prefer_FSRM)
> BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
> /* Disable everything. */
> "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> - "-AVX_Fast_Unaligned_Load",
> + "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> test_1,
> array_length (test_1)
> },
> @@ -68,7 +68,7 @@ static const struct test_t
> /* Same as before, but with some empty suboptions. */
> ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> - "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> + "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> test_1,
> array_length (test_1)
> }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx512_unaligned_erms);
>
> return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (evex_unaligned_erms);
>
> return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms_rtm);
>
> return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
> if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> Prefer_No_VZEROUPPER, !))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms);
>
> return OPTIMIZE (avx2_unaligned);
> }
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (sse2_unaligned_erms);
>
> return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1
>
NB: This patch is intended for after the 2.40 release.
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年7月10日 14:52
> 收件人: libc-alpha@sourceware.org
> 抄送: goldstein.w.n@gmail.com; hjl.tools@gmail.com; Feifei Wang
> <wangfeifei@hygon.cn>
> 主题: x86: Add new cpu-flag `Prefer_Non_Temporal`
>
> The goal of this flag is to allow targets which don't prefer/have ERMS to still
> access the non-temporal memset implementation.
> ---
> sysdeps/x86/cpu-tunables.c | 2 ++
> sysdeps/x86/dl-cacheinfo.h | 28
> +++++++++++++++++--
> ...cpu-features-preferred_feature_index_1.def | 1 +
> sysdeps/x86/tst-hwcap-tunables.c | 4 +--
> sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
> 5 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index
> 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t
> *valp)
> Fast_Unaligned_Load, 19);
> CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> Fast_Unaligned_Copy, 19);
> + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> + Prefer_Non_Temporal, 19);
> }
> break;
> case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index
> 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
> slightly better than ERMS. */
> rep_stosb_threshold = SIZE_MAX;
>
> + /*
> + * For memset, the non-temporal implementation is only accessed through
> the
> + * stosb code. ie:
> + * ```
> + * if (size >= rep_stosb_thresh)
> + * {
> + * if (size >= non_temporal_thresh)
> + * {
> + * do_non_temporal ();
> + * }
> + * do_stosb ();
> + * }
> + * do_normal_vec_loop ();
> + * ```
> + * So if we prefer non-temporal, set `rep_stosb_thresh =
> non_temporal_thresh`
> + * to enable the implementation. If `rep_stosb_thresh =
> non_temporal_thresh`,
Maybe use `==` here: "If `rep_stosb_thresh == non_temporal_thresh`,"
> + *`rep stosb` will never be used.
> + */
> + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> + memset_non_temporal_threshold,
> + minimum_non_temporal_threshold, SIZE_MAX);
> + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> + rep_stosb_threshold
> + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
> TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold,
> non_temporal_threshold,
> minimum_non_temporal_threshold,
> maximum_non_temporal_threshold);
> - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> - memset_non_temporal_threshold,
> - minimum_non_temporal_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold,
> rep_movsb_threshold,
> minimum_rep_movsb_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold,
> rep_stosb_threshold, 1, diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) BIT
> (MathVec_Prefer_No_AVX512) BIT (Prefer_FSRM) BIT
> (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
> /* Disable everything. */
> "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> - "-AVX_Fast_Unaligned_Load",
> + "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> test_1,
> array_length (test_1)
> },
> @@ -68,7 +68,7 @@ static const struct test_t
> /* Same as before, but with some empty suboptions. */
> ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> - "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> + "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> test_1,
> array_length (test_1)
> }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h
> b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx512_unaligned_erms);
>
> return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (evex_unaligned_erms);
>
> return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms_rtm);
>
> return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +96,16 @@
> IFUNC_SELECTOR (void)
> if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> Prefer_No_VZEROUPPER, !))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms);
>
> return OPTIMIZE (avx2_unaligned);
> }
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (sse2_unaligned_erms);
>
> return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1
Reviewed-by: Feifei Wang <wangfeifei@hygon.cn>
On 10/07/24 03:52, Noah Goldstein wrote:
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
> sysdeps/x86/cpu-tunables.c | 2 ++
> sysdeps/x86/dl-cacheinfo.h | 28 +++++++++++++++++--
> ...cpu-features-preferred_feature_index_1.def | 1 +
> sysdeps/x86/tst-hwcap-tunables.c | 4 +--
> sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
> 5 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> Fast_Unaligned_Load, 19);
> CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> Fast_Unaligned_Copy, 19);
> + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> + Prefer_Non_Temporal, 19);
> }
> break;
> case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> slightly better than ERMS. */
> rep_stosb_threshold = SIZE_MAX;
>
> + /*
> + * For memset, the non-temporal implementation is only accessed through the
> + * stosb code. ie:
> + * ```
> + * if (size >= rep_stosb_thresh)
> + * {
> + * if (size >= non_temporal_thresh)
> + * {
> + * do_non_temporal ();
> + * }
> + * do_stosb ();
> + * }
> + * do_normal_vec_loop ();
> + * ```
> + * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> + * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> + *`rep stosb` will never be used.
> + */
Small nit, multi-line comments usually don't have the '*' for each line:
/*
<comment>
*/
> + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> + memset_non_temporal_threshold,
> + minimum_non_temporal_threshold, SIZE_MAX);
> + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> + rep_stosb_threshold
> + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
> TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> minimum_non_temporal_threshold,
> maximum_non_temporal_threshold);
> - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> - memset_non_temporal_threshold,
> - minimum_non_temporal_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> minimum_rep_movsb_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> BIT (MathVec_Prefer_No_AVX512)
> BIT (Prefer_FSRM)
> BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
> /* Disable everything. */
> "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> - "-AVX_Fast_Unaligned_Load",
> + "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> test_1,
> array_length (test_1)
> },
> @@ -68,7 +68,7 @@ static const struct test_t
> /* Same as before, but with some empty suboptions. */
> ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> - "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> + "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> test_1,
> array_length (test_1)
> }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx512_unaligned_erms);
>
> return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (evex_unaligned_erms);
>
> return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms_rtm);
>
> return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
> if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> Prefer_No_VZEROUPPER, !))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (avx2_unaligned_erms);
>
> return OPTIMIZE (avx2_unaligned);
> }
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> return OPTIMIZE (sse2_unaligned_erms);
>
> return OPTIMIZE (sse2_unaligned);
Should we do similar checks for ifunc-memmove.h as well since they also
have non-temporal code paths?
On Wed, Jul 10, 2024 at 10:03 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 10/07/24 03:52, Noah Goldstein wrote:
> > The goal of this flag is to allow targets which don't prefer/have ERMS
> > to still access the non-temporal memset implementation.
> > ---
> > sysdeps/x86/cpu-tunables.c | 2 ++
> > sysdeps/x86/dl-cacheinfo.h | 28 +++++++++++++++++--
> > ...cpu-features-preferred_feature_index_1.def | 1 +
> > sysdeps/x86/tst-hwcap-tunables.c | 4 +--
> > sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
> > 5 files changed, 40 insertions(+), 10 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index 89da7a03da..8f6032b510 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> > Fast_Unaligned_Load, 19);
> > CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > Fast_Unaligned_Copy, 19);
> > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > + Prefer_Non_Temporal, 19);
> > }
> > break;
> > case 20:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..2cb70a0d0d 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > slightly better than ERMS. */
> > rep_stosb_threshold = SIZE_MAX;
> >
> > + /*
> > + * For memset, the non-temporal implementation is only accessed through the
> > + * stosb code. ie:
> > + * ```
> > + * if (size >= rep_stosb_thresh)
> > + * {
> > + * if (size >= non_temporal_thresh)
> > + * {
> > + * do_non_temporal ();
> > + * }
> > + * do_stosb ();
> > + * }
> > + * do_normal_vec_loop ();
> > + * ```
> > + * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> > + * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> > + *`rep stosb` will never be used.
> > + */
>
> Small nit, multi-line comments usually don't have the '*' for each line:
>
> /*
> <comment>
> */
Ack.
>
> > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > + memset_non_temporal_threshold,
> > + minimum_non_temporal_threshold, SIZE_MAX);
> > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > + rep_stosb_threshold
> > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> > +
> > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> > minimum_non_temporal_threshold,
> > maximum_non_temporal_threshold);
> > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > - memset_non_temporal_threshold,
> > - minimum_non_temporal_threshold, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> > minimum_rep_movsb_threshold, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..5c923b3dcb 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> > BIT (MathVec_Prefer_No_AVX512)
> > BIT (Prefer_FSRM)
> > BIT (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Prefer_Non_Temporal)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..aec852770a 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> > /* Disable everything. */
> > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > - "-AVX_Fast_Unaligned_Load",
> > + "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> > test_1,
> > array_length (test_1)
> > },
> > @@ -68,7 +68,7 @@ static const struct test_t
> > /* Same as before, but with some empty suboptions. */
> > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > - "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > + "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> > test_1,
> > array_length (test_1)
> > }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 7a637ef7ca..27f04e9dce 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx512_unaligned_erms);
> >
> > return OPTIMIZE (avx512_unaligned);
> > @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (evex_unaligned_erms);
> >
> > return OPTIMIZE (evex_unaligned);
> > @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> >
> > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx2_unaligned_erms_rtm);
> >
> > return OPTIMIZE (avx2_unaligned_rtm);
> > @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
> > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > Prefer_No_VZEROUPPER, !))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx2_unaligned_erms);
> >
> > return OPTIMIZE (avx2_unaligned);
> > }
> > }
> >
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (sse2_unaligned_erms);
> >
> > return OPTIMIZE (sse2_unaligned);
>
> Should we do similar checks for ifunc-memmove.h as well since they also
> have non-temporal code paths?
At the moment, __memmove_vec_unaligned_erms is implemented with a path
for non-temporal stores that doesn't go through the ERMS code.
If that changes, however, we should update this.
Hi, Noah
We are creating a Hygon branch and trying to enable non-temporal (nt) memset based on this patch.
However, we have hit a problem that blocks us from using nt memset.
In dl_init_cacheinfo, at line 1050:
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
&& (cpu_features->basic.kind == arch_kind_intel
|| cpu_features->basic.kind == arch_kind_amd))
memset_non_temporal_threshold = non_temporal_threshold;
Here, do we need to add a 'Prefer_Non_Temporal' case and change it as below:
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
&& (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
||(cpu_features->basic.kind == arch_kind_intel
|| cpu_features->basic.kind == arch_kind_amd))
memset_non_temporal_threshold = non_temporal_threshold;
or let vendors do this themselves:
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
&& (cpu_features->basic.kind == arch_kind_intel)
|| (cpu_features->basic.kind == arch_kind_amd)
|| (cpu_features->basic.kind == arch_kind_hygon))
memset_non_temporal_threshold = non_temporal_threshold;
Best Regards
Feifei
> -----邮件原件-----
> 发件人: Feifei Wang
> 发送时间: 2024年7月10日 15:34
> 收件人: 'Noah Goldstein' <goldstein.w.n@gmail.com>;
> libc-alpha@sourceware.org
> 抄送: hjl.tools@gmail.com; Jing Li <lijing@hygon.cn>
> 主题: 答复: x86: Add new cpu-flag `Prefer_Non_Temporal`
>
>
>
> > -----邮件原件-----
> > 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> > 发送时间: 2024年7月10日 14:52
> > 收件人: libc-alpha@sourceware.org
> > 抄送: goldstein.w.n@gmail.com; hjl.tools@gmail.com; Feifei Wang
> > <wangfeifei@hygon.cn>
> > 主题: x86: Add new cpu-flag `Prefer_Non_Temporal`
> >
> > The goal of this flag is to allow targets which don't prefer/have ERMS
> > to still access the non-temporal memset implementation.
> > ---
> > sysdeps/x86/cpu-tunables.c | 2 ++
> > sysdeps/x86/dl-cacheinfo.h | 28
> > +++++++++++++++++--
> > ...cpu-features-preferred_feature_index_1.def | 1 +
> > sysdeps/x86/tst-hwcap-tunables.c | 4 +--
> > sysdeps/x86_64/multiarch/ifunc-memset.h | 15 ++++++----
> > 5 files changed, 40 insertions(+), 10 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index
> > 89da7a03da..8f6032b510 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t
> > *valp)
> > Fast_Unaligned_Load, 19);
> > CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > Fast_Unaligned_Copy, 19);
> > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > + Prefer_Non_Temporal, 19);
> > }
> > break;
> > case 20:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..2cb70a0d0d 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features
> > *cpu_features)
> > slightly better than ERMS. */
> > rep_stosb_threshold = SIZE_MAX;
> >
> > + /*
> > + * For memset, the non-temporal implementation is only accessed
> > + through
> > the
> > + * stosb code. ie:
> > + * ```
> > + * if (size >= rep_stosb_thresh)
> > + * {
> > + * if (size >= non_temporal_thresh)
> > + * {
> > + * do_non_temporal ();
> > + * }
> > + * do_stosb ();
> > + * }
> > + * do_normal_vec_loop ();
> > + * ```
> > + * So if we prefer non-temporal, set `rep_stosb_thresh =
> > non_temporal_thresh`
> > + * to enable the implementation. If `rep_stosb_thresh =
> > non_temporal_thresh`,
> Maybe "If `rep_stosb_thresh == non_temporal_thresh`"
> > + *`rep stosb` will never be used.
> > + */
> > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > + memset_non_temporal_threshold,
> > + minimum_non_temporal_threshold, SIZE_MAX);
> > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > + rep_stosb_threshold
> > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> > +
> > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0,
> SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> > SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold,
> > non_temporal_threshold,
> > minimum_non_temporal_threshold,
> > maximum_non_temporal_threshold);
> > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > - memset_non_temporal_threshold,
> > - minimum_non_temporal_threshold, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold,
> > rep_movsb_threshold,
> > minimum_rep_movsb_threshold, SIZE_MAX);
> > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold,
> > rep_stosb_threshold, 1, diff --git
> > a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..5c923b3dcb 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) BIT
> > (MathVec_Prefer_No_AVX512) BIT (Prefer_FSRM) BIT
> > (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Prefer_Non_Temporal)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> > b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..aec852770a 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> > /* Disable everything. */
> > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > - "-AVX_Fast_Unaligned_Load",
> > + "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> > test_1,
> > array_length (test_1)
> > },
> > @@ -68,7 +68,7 @@ static const struct test_t
> > /* Same as before, but with some empty suboptions. */
> > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > - "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > + "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> > test_1,
> > array_length (test_1)
> > }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 7a637ef7ca..27f04e9dce 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx512_unaligned_erms);
> >
> > return OPTIMIZE (avx512_unaligned); @@ -76,7 +77,8 @@
> > IFUNC_SELECTOR (void)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (evex_unaligned_erms);
> >
> > return OPTIMIZE (evex_unaligned);
> > @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> >
> > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx2_unaligned_erms_rtm);
> >
> > return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +96,16 @@
> > IFUNC_SELECTOR (void)
> > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > Prefer_No_VZEROUPPER, !))
> > {
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (avx2_unaligned_erms);
> >
> > return OPTIMIZE (avx2_unaligned);
> > }
> > }
> >
> > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > return OPTIMIZE (sse2_unaligned_erms);
> >
> > return OPTIMIZE (sse2_unaligned);
> > --
> > 2.34.1
>
> Reviewed-by: Feifei Wang <wangfeifei@hygon.cn>
On Thu, Aug 8, 2024 at 3:35 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
> Hi, Noah
>
> We are creating hygon branch and try to enable nt memset based on this patch.
> However, we have a problem which blocked us from using nt memset
> In dl_init_cacheinfo line 1050:
>
> /* Non-temporal stores are more performant on Intel and AMD hardware above
> non_temporal_threshold. Enable this for both Intel and AMD hardware. */
> unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> && (cpu_features->basic.kind == arch_kind_intel
> || cpu_features->basic.kind == arch_kind_amd))
> memset_non_temporal_threshold = non_temporal_threshold;
>
>
> Here, if we need to add 'Prefer_Non_Temporal' case and change as below:
> if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> ||(cpu_features->basic.kind == arch_kind_intel
> || cpu_features->basic.kind == arch_kind_amd))
> memset_non_temporal_threshold = non_temporal_threshold;
>
> or let vendor to do this with themselves:
> if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> && (cpu_features->basic.kind == arch_kind_intel)
> || (cpu_features->basic.kind == arch_kind_amd)
> || (cpu_features->basic.kind == arch_kind_hygon))
> memset_non_temporal_threshold = non_temporal_threshold;
>
> Best Regards
> Feifei
Did:
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
&& (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal)
|| cpu_features->basic.kind == arch_kind_intel
|| cpu_features->basic.kind == arch_kind_amd))
memset_non_temporal_threshold = non_temporal_threshold;
So for hygon (or any equiv arch) just set `Prefer_Non_Temporal`
(and don't set `Avoid_Non_Temporal_Memset`) and you will
be fine.
[...snip...]
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年8月11日 13:59
> 收件人: Feifei Wang <wangfeifei@hygon.cn>
> 抄送: libc-alpha@sourceware.org; hjl.tools@gmail.com; Jing Li
> <lijing@hygon.cn>
> 主题: Re: x86: Add new cpu-flag `Prefer_Non_Temporal`
>
> On Thu, Aug 8, 2024 at 3:35 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> >
> > Hi, Noah
> >
> > We are creating hygon branch and try to enable nt memset based on this
> patch.
> > However, we have a problem which blocked us from using nt memset In
> > dl_init_cacheinfo line 1050:
> >
> > /* Non-temporal stores are more performant on Intel and AMD hardware
> above
> > non_temporal_threshold. Enable this for both Intel and AMD hardware.
> */
> > unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> > if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> > && (cpu_features->basic.kind == arch_kind_intel
> > || cpu_features->basic.kind == arch_kind_amd))
> > memset_non_temporal_threshold = non_temporal_threshold;
> >
> >
> > Here, if we need to add 'Prefer_Non_Temporal' case and change as below:
> > if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> > && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > ||(cpu_features->basic.kind == arch_kind_intel
> > || cpu_features->basic.kind == arch_kind_amd))
> > memset_non_temporal_threshold = non_temporal_threshold;
> >
> > or let vendor to do this with themselves:
> > if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> > && (cpu_features->basic.kind == arch_kind_intel)
> > || (cpu_features->basic.kind == arch_kind_amd)
> > || (cpu_features->basic.kind == arch_kind_hygon))
> > memset_non_temporal_threshold = non_temporal_threshold;
> >
> > Best Regards
> > Feifei
>
> Did:
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal)
> || cpu_features->basic.kind == arch_kind_intel
> || cpu_features->basic.kind == arch_kind_amd))
> memset_non_temporal_threshold = non_temporal_threshold;
>
> So for hygon (or any equiv arch) just set `Prefer_Non_Temporal` (and don't set
> `Avoid_Non_Temporal_Memset`) and you will be fine.
That's good. Thanks for this change.
>
> [[...snip...]
@@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
Fast_Unaligned_Load, 19);
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Fast_Unaligned_Copy, 19);
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Prefer_Non_Temporal, 19);
}
break;
case 20:
@@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
slightly better than ERMS. */
rep_stosb_threshold = SIZE_MAX;
+ /*
+ * For memset, the non-temporal implementation is only accessed through the
+ * stosb code. ie:
+ * ```
+ * if (size >= rep_stosb_thresh)
+ * {
+ * if (size >= non_temporal_thresh)
+ * {
+ * do_non_temporal ();
+ * }
+ * do_stosb ();
+ * }
+ * do_normal_vec_loop ();
+ * ```
+ * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+ * to enable the implementation. If `rep_stosb_thresh == non_temporal_thresh`,
+ * `rep stosb` will never be used.
+ */
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
+ rep_stosb_threshold
+ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
- memset_non_temporal_threshold,
- minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Prefer_Non_Temporal)
@@ -60,7 +60,7 @@ static const struct test_t
/* Disable everything. */
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
- "-AVX_Fast_Unaligned_Load",
+ "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
test_1,
array_length (test_1)
},
@@ -68,7 +68,7 @@ static const struct test_t
/* Same as before, but with some empty suboptions. */
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
- "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
test_1,
array_length (test_1)
}
@@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
return OPTIMIZE (avx512_unaligned_erms);
return OPTIMIZE (avx512_unaligned);
@@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
return OPTIMIZE (evex_unaligned_erms);
return OPTIMIZE (evex_unaligned);
@@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
return OPTIMIZE (avx2_unaligned_erms_rtm);
return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
return OPTIMIZE (avx2_unaligned_erms);
return OPTIMIZE (avx2_unaligned);
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
return OPTIMIZE (sse2_unaligned_erms);
return OPTIMIZE (sse2_unaligned);