x86: Add new cpu-flag `Prefer_Non_Temporal`

Message ID 20240710065226.2509525-1-goldstein.w.n@gmail.com (mailing list archive)
State Dropped
Headers
Series x86: Add new cpu-flag `Prefer_Non_Temporal` |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Noah Goldstein July 10, 2024, 6:52 a.m. UTC
  The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
---
 sysdeps/x86/cpu-tunables.c                    |  2 ++
 sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
 ...cpu-features-preferred_feature_index_1.def |  1 +
 sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
 sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
 5 files changed, 40 insertions(+), 10 deletions(-)
  

Comments

Noah Goldstein July 10, 2024, 6:53 a.m. UTC | #1
On Wed, Jul 10, 2024 at 2:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>                                                 Fast_Unaligned_Load, 19);
>               CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>                                                 Fast_Unaligned_Copy, 19);
> +             CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +                                               Prefer_Non_Temporal, 19);
>             }
>           break;
>         case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
>
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *   if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *   do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> +   *`rep stosb` will never be used.
> +   */
> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +                          memset_non_temporal_threshold,
> +                          minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +       = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>                            minimum_non_temporal_threshold,
>                            maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -                          memset_non_temporal_threshold,
> -                          minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>                            minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx512_unaligned_erms);
>
>           return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (evex_unaligned_erms);
>
>           return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx2_unaligned_erms_rtm);
>
>           return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>                                        Prefer_No_VZEROUPPER, !))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx2_unaligned_erms);
>
>           return OPTIMIZE (avx2_unaligned);
>         }
>      }
>
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
>
>    return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1
>
NB: This patch is intended for after the 2.40 release.
  
Feifei Wang July 10, 2024, 7:34 a.m. UTC | #2
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年7月10日 14:52
> 收件人: libc-alpha@sourceware.org
> 抄送: goldstein.w.n@gmail.com; hjl.tools@gmail.com; Feifei Wang
> <wangfeifei@hygon.cn>
> 主题: x86: Add new cpu-flag `Prefer_Non_Temporal`
> 
> The goal of this flag is to allow targets which don't prefer/have ERMS to still
> access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28
> +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index
> 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t
> *valp)
>  						Fast_Unaligned_Load, 19);
>  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>  						Fast_Unaligned_Copy, 19);
> +	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +						Prefer_Non_Temporal, 19);
>  	    }
>  	  break;
>  	case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index
> 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
> 
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through
> the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *	if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *	do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh =
> non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh =
> non_temporal_thresh`,
       Maybe: "If `rep_stosb_thresh == non_temporal_thresh`,"
> +   *`rep stosb` will never be used.
> +   */
> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +			   memset_non_temporal_threshold,
> +			   minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold,
> non_temporal_threshold,
>  			   minimum_non_temporal_threshold,
>  			   maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -			   memset_non_temporal_threshold,
> -			   minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold,
> rep_movsb_threshold,
>  			   minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold,
> rep_stosb_threshold, 1, diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)  BIT
> (MathVec_Prefer_No_AVX512)  BIT (Prefer_FSRM)  BIT
> (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h
> b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx512_unaligned_erms);
> 
>  	  return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (evex_unaligned_erms);
> 
>  	  return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> 
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms_rtm);
> 
>  	  return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +96,16 @@
> IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>  				       Prefer_No_VZEROUPPER, !))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms);
> 
>  	  return OPTIMIZE (avx2_unaligned);
>  	}
>      }
> 
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
> 
>    return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1

Reviewed-by: Feifei Wang <wangfeifei@hygon.cn>
  
Adhemerval Zanella July 10, 2024, 2:03 p.m. UTC | #3
On 10/07/24 03:52, Noah Goldstein wrote:
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>  						Fast_Unaligned_Load, 19);
>  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>  						Fast_Unaligned_Copy, 19);
> +	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +						Prefer_Non_Temporal, 19);
>  	    }
>  	  break;
>  	case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
>  
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *	if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *	do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> +   *`rep stosb` will never be used.
> +   */

Small nit, multi-line comments usually don't have the '*' for each line:

/*
 <comment>
 */

> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +			   memset_non_temporal_threshold,
> +			   minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>  			   minimum_non_temporal_threshold,
>  			   maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -			   memset_non_temporal_threshold,
> -			   minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>  			   minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx512_unaligned_erms);
>  
>  	  return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (evex_unaligned_erms);
>  
>  	  return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>  
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms_rtm);
>  
>  	  return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>  				       Prefer_No_VZEROUPPER, !))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms);
>  
>  	  return OPTIMIZE (avx2_unaligned);
>  	}
>      }
>  
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
>  
>    return OPTIMIZE (sse2_unaligned);

Should we add similar checks to ifunc-memmove.h as well, since it also
has non-temporal code paths?
  
Noah Goldstein July 10, 2024, 2:20 p.m. UTC | #4
On Wed, Jul 10, 2024 at 10:03 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 10/07/24 03:52, Noah Goldstein wrote:
> > The goal of this flag is to allow targets which don't prefer/have ERMS
> > to still access the non-temporal memset implementation.
> > ---
> >  sysdeps/x86/cpu-tunables.c                    |  2 ++
> >  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
> >  ...cpu-features-preferred_feature_index_1.def |  1 +
> >  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
> >  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
> >  5 files changed, 40 insertions(+), 10 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index 89da7a03da..8f6032b510 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> >                                               Fast_Unaligned_Load, 19);
> >             CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> >                                               Fast_Unaligned_Copy, 19);
> > +           CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > +                                             Prefer_Non_Temporal, 19);
> >           }
> >         break;
> >       case 20:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..2cb70a0d0d 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >         slightly better than ERMS.  */
> >      rep_stosb_threshold = SIZE_MAX;
> >
> > +  /*
> > +   * For memset, the non-temporal implementation is only accessed through the
> > +   * stosb code. ie:
> > +   * ```
> > +   * if (size >= rep_stosb_thresh)
> > +   * {
> > +   * if (size >= non_temporal_thresh)
> > +   * {
> > +   * do_non_temporal ();
> > +   * }
> > +   * do_stosb ();
> > +   * }
> > +   * do_normal_vec_loop ();
> > +   * ```
> > +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> > +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> > +   *`rep stosb` will never be used.
> > +   */
>
> Small nit, multi-line comments usually don't have the '*' for each line:
>
> /*
>  <comment>
>  */

Ack.

>
> > +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > +                        memset_non_temporal_threshold,
> > +                        minimum_non_temporal_threshold, SIZE_MAX);
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > +    rep_stosb_threshold
> > +     = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> > +
> >    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> >                          minimum_non_temporal_threshold,
> >                          maximum_non_temporal_threshold);
> > -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > -                        memset_non_temporal_threshold,
> > -                        minimum_non_temporal_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> >                          minimum_rep_movsb_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..5c923b3dcb 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >  BIT (MathVec_Prefer_No_AVX512)
> >  BIT (Prefer_FSRM)
> >  BIT (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Prefer_Non_Temporal)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..aec852770a 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> >      /* Disable everything.  */
> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > -    "-AVX_Fast_Unaligned_Load",
> > +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> >      test_1,
> >      array_length (test_1)
> >    },
> > @@ -68,7 +68,7 @@ static const struct test_t
> >      /* Same as before, but with some empty suboptions.  */
> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> >      test_1,
> >      array_length (test_1)
> >    }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 7a637ef7ca..27f04e9dce 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx512_unaligned_erms);
> >
> >         return OPTIMIZE (avx512_unaligned);
> > @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (evex_unaligned_erms);
> >
> >         return OPTIMIZE (evex_unaligned);
> > @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> >
> >        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx2_unaligned_erms_rtm);
> >
> >         return OPTIMIZE (avx2_unaligned_rtm);
> > @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
> >        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> >                                      Prefer_No_VZEROUPPER, !))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx2_unaligned_erms);
> >
> >         return OPTIMIZE (avx2_unaligned);
> >       }
> >      }
> >
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >      return OPTIMIZE (sse2_unaligned_erms);
> >
> >    return OPTIMIZE (sse2_unaligned);
>
> Should we do similar checks for ifunc-memmove.h as well since they also
> have non-temporal code paths?

At the moment __memmove_vec_unaligned_erms is implemented with a path
for non-temporal stores that doesn't go through the ERMS code.

If that changes, however, we should update this.
  
Feifei Wang Aug. 8, 2024, 7:35 a.m. UTC | #5
Hi, Noah

We are creating a Hygon branch and are trying to enable non-temporal (nt) memset based on this patch.
However, we have a problem that blocks us from using nt memset.
In dl_init_cacheinfo, line 1050:

  /* Non-temporal stores are more performant on Intel and AMD hardware above
     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
      && (cpu_features->basic.kind == arch_kind_intel
	  || cpu_features->basic.kind == arch_kind_amd))
    memset_non_temporal_threshold = non_temporal_threshold;


Here, do we need to add a 'Prefer_Non_Temporal' case and change it as below:
  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
      && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
	  ||(cpu_features->basic.kind == arch_kind_intel
	  || cpu_features->basic.kind == arch_kind_amd))
    memset_non_temporal_threshold = non_temporal_threshold;

or let each vendor do this themselves:
  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
      && (cpu_features->basic.kind == arch_kind_intel)
	  || (cpu_features->basic.kind == arch_kind_amd)
	  || (cpu_features->basic.kind == arch_kind_hygon))
    memset_non_temporal_threshold = non_temporal_threshold;

Best Regards
Feifei

> -----邮件原件-----
> 发件人: Feifei Wang
> 发送时间: 2024年7月10日 15:34
> 收件人: 'Noah Goldstein' <goldstein.w.n@gmail.com>;
> libc-alpha@sourceware.org
> 抄送: hjl.tools@gmail.com; Jing Li <lijing@hygon.cn>
> 主题: 答复: x86: Add new cpu-flag `Prefer_Non_Temporal`
> 
> 
> 
> > -----邮件原件-----
> > 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> > 发送时间: 2024年7月10日 14:52
> > 收件人: libc-alpha@sourceware.org
> > 抄送: goldstein.w.n@gmail.com; hjl.tools@gmail.com; Feifei Wang
> > <wangfeifei@hygon.cn>
> > 主题: x86: Add new cpu-flag `Prefer_Non_Temporal`
> >
> > The goal of this flag is to allow targets which don't prefer/have ERMS
> > to still access the non-temporal memset implementation.
> > ---
> >  sysdeps/x86/cpu-tunables.c                    |  2 ++
> >  sysdeps/x86/dl-cacheinfo.h                    | 28
> > +++++++++++++++++--
> >  ...cpu-features-preferred_feature_index_1.def |  1 +
> >  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
> >  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
> >  5 files changed, 40 insertions(+), 10 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index
> > 89da7a03da..8f6032b510 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t
> > *valp)
> >  						Fast_Unaligned_Load, 19);
> >  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> >  						Fast_Unaligned_Copy, 19);
> > +	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > +						Prefer_Non_Temporal, 19);
> >  	    }
> >  	  break;
> >  	case 20:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..2cb70a0d0d 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features
> > *cpu_features)
> >         slightly better than ERMS.  */
> >      rep_stosb_threshold = SIZE_MAX;
> >
> > +  /*
> > +   * For memset, the non-temporal implementation is only accessed
> > + through
> > the
> > +   * stosb code. ie:
> > +   * ```
> > +   * if (size >= rep_stosb_thresh)
> > +   * {
> > +   *	if (size >= non_temporal_thresh)
> > +   * {
> > +   * do_non_temporal ();
> > +   * }
> > +   *	do_stosb ();
> > +   * }
> > +   * do_normal_vec_loop ();
> > +   * ```
> > +   * So if we prefer non-temporal, set `rep_stosb_thresh =
> > non_temporal_thresh`
> > +   * to enable the implementation. If `rep_stosb_thresh =
> > non_temporal_thresh`,
>        Maybe " If `rep_stosb_thresh == non_temporal_thresh`
> > +   *`rep stosb` will never be used.
> > +   */
> > +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > +			   memset_non_temporal_threshold,
> > +			   minimum_non_temporal_threshold, SIZE_MAX);
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > +    rep_stosb_threshold
> > +	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> > +
> >    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0,
> SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> > SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold,
> > non_temporal_threshold,
> >  			   minimum_non_temporal_threshold,
> >  			   maximum_non_temporal_threshold);
> > -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > -			   memset_non_temporal_threshold,
> > -			   minimum_non_temporal_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold,
> > rep_movsb_threshold,
> >  			   minimum_rep_movsb_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold,
> > rep_stosb_threshold, 1, diff --git
> > a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..5c923b3dcb 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)  BIT
> > (MathVec_Prefer_No_AVX512)  BIT (Prefer_FSRM)  BIT
> > (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Prefer_Non_Temporal)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> > b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..aec852770a 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> >      /* Disable everything.  */
> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > -    "-AVX_Fast_Unaligned_Load",
> > +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> >      test_1,
> >      array_length (test_1)
> >    },
> > @@ -68,7 +68,7 @@ static const struct test_t
> >      /* Same as before, but with some empty suboptions.  */
> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> >      test_1,
> >      array_length (test_1)
> >    }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 7a637ef7ca..27f04e9dce 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> >  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >  	{
> > -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >  	    return OPTIMIZE (avx512_unaligned_erms);
> >
> >  	  return OPTIMIZE (avx512_unaligned); @@ -76,7 +77,8 @@
> > IFUNC_SELECTOR (void)
> >  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >  	{
> > -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >  	    return OPTIMIZE (evex_unaligned_erms);
> >
> >  	  return OPTIMIZE (evex_unaligned);
> > @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> >
> >        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> >  	{
> > -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >  	    return OPTIMIZE (avx2_unaligned_erms_rtm);
> >
> >  	  return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +96,16 @@
> > IFUNC_SELECTOR (void)
> >        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> >  				       Prefer_No_VZEROUPPER, !))
> >  	{
> > -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >  	    return OPTIMIZE (avx2_unaligned_erms);
> >
> >  	  return OPTIMIZE (avx2_unaligned);
> >  	}
> >      }
> >
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >      return OPTIMIZE (sse2_unaligned_erms);
> >
> >    return OPTIMIZE (sse2_unaligned);
> > --
> > 2.34.1
> 
> Reviewed-by: Feifei Wang <wangfeifei@hygon.cn>
  
Noah Goldstein Aug. 11, 2024, 5:58 a.m. UTC | #6
On Thu, Aug 8, 2024 at 3:35 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
> Hi, Noah
>
> We are creating hygon branch and try to enable nt memset based on this patch.
> However,  we have a problem which blocked us from using nt memset
> In dl_init_cacheinfo line 1050:
>
>   /* Non-temporal stores are more performant on Intel and AMD hardware above
>      non_temporal_threshold. Enable this for both Intel and AMD hardware. */
>   unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>   if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>       && (cpu_features->basic.kind == arch_kind_intel
>           || cpu_features->basic.kind == arch_kind_amd))
>     memset_non_temporal_threshold = non_temporal_threshold;
>
>
> Here, if we need to add 'Prefer_Non_Temporal' case and change as below:
>   if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>       && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>           ||(cpu_features->basic.kind == arch_kind_intel
>           || cpu_features->basic.kind == arch_kind_amd))
>     memset_non_temporal_threshold = non_temporal_threshold;
>
> or let vendor to do this with themselves:
>   if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>       && (cpu_features->basic.kind == arch_kind_intel)
>           || (cpu_features->basic.kind == arch_kind_amd)
>           || (cpu_features->basic.kind == arch_kind_hygon))
>     memset_non_temporal_threshold = non_temporal_threshold;
>
> Best Regards
> Feifei

Did:

   if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
       && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal)
           || cpu_features->basic.kind == arch_kind_intel
           || cpu_features->basic.kind == arch_kind_amd))
     memset_non_temporal_threshold = non_temporal_threshold;

So for hygon (or any equiv arch) just set `Prefer_Non_Temporal`
(and don't set `Avoid_Non_Temporal_Memset`) and you will
be fine.

[[...snip...]
  
Feifei Wang Aug. 12, 2024, 1:50 a.m. UTC | #7
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年8月11日 13:59
> 收件人: Feifei Wang <wangfeifei@hygon.cn>
> 抄送: libc-alpha@sourceware.org; hjl.tools@gmail.com; Jing Li
> <lijing@hygon.cn>
> 主题: Re: x86: Add new cpu-flag `Prefer_Non_Temporal`
> 
> On Thu, Aug 8, 2024 at 3:35 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> >
> > Hi, Noah
> >
> > We are creating hygon branch and try to enable nt memset based on this
> patch.
> > However,  we have a problem which blocked us from using nt memset In
> > dl_init_cacheinfo line 1050:
> >
> >   /* Non-temporal stores are more performant on Intel and AMD hardware
> above
> >      non_temporal_threshold. Enable this for both Intel and AMD hardware.
> */
> >   unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> >   if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> >       && (cpu_features->basic.kind == arch_kind_intel
> >           || cpu_features->basic.kind == arch_kind_amd))
> >     memset_non_temporal_threshold = non_temporal_threshold;
> >
> >
> > Here, if we need to add 'Prefer_Non_Temporal' case and change as below:
> >   if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> >       && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           ||(cpu_features->basic.kind == arch_kind_intel
> >           || cpu_features->basic.kind == arch_kind_amd))
> >     memset_non_temporal_threshold = non_temporal_threshold;
> >
> > or let vendor to do this with themselves:
> >   if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset)
> >       && (cpu_features->basic.kind == arch_kind_intel)
> >           || (cpu_features->basic.kind == arch_kind_amd)
> >           || (cpu_features->basic.kind == arch_kind_hygon))
> >     memset_non_temporal_threshold = non_temporal_threshold;
> >
> > Best Regards
> > Feifei
> 
> Did:
> 
>    if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>        && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal)
>            || cpu_features->basic.kind == arch_kind_intel
>            || cpu_features->basic.kind == arch_kind_amd))
>      memset_non_temporal_threshold = non_temporal_threshold;
> 
> So for hygon (or any equiv arch) just set `Prefer_Non_Temporal` (and don't set
> `Avoid_Non_Temporal_Memset`) and you will be fine.

That's good. Thanks for this change.

> 
> [[...snip...]
  

Patch

diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a03da..8f6032b510 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -223,6 +223,8 @@  TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 						Fast_Unaligned_Load, 19);
 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
 						Fast_Unaligned_Copy, 19);
+	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+						Prefer_Non_Temporal, 19);
 	    }
 	  break;
 	case 20:
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5e77345a6e..2cb70a0d0d 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1041,14 +1041,36 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
        slightly better than ERMS.  */
     rep_stosb_threshold = SIZE_MAX;
 
+  /*
+   * For memset, the non-temporal implementation is only accessed through the
+   * stosb code. ie:
+   * ```
+   * if (size >= rep_stosb_thresh)
+   * {
+   *	if (size >= non_temporal_thresh)
+   * {
+   * do_non_temporal ();
+   * }
+   *	do_stosb ();
+   * }
+   * do_normal_vec_loop ();
+   * ```
+   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+   * to enable the implementation. If `rep_stosb_thresh == non_temporal_thresh`,
+   * `rep stosb` will never be used.
+   */
+  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+			   memset_non_temporal_threshold,
+			   minimum_non_temporal_threshold, SIZE_MAX);
+  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
+    rep_stosb_threshold
+	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
 			   minimum_non_temporal_threshold,
 			   maximum_non_temporal_threshold);
-  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
-			   memset_non_temporal_threshold,
-			   minimum_non_temporal_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 			   minimum_rep_movsb_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 85e7f54ec8..5c923b3dcb 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@  BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Prefer_Non_Temporal)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 8589a9fd66..aec852770a 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,7 @@  static const struct test_t
     /* Disable everything.  */
     "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
-    "-AVX_Fast_Unaligned_Load",
+    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
     test_1,
     array_length (test_1)
   },
@@ -68,7 +68,7 @@  static const struct test_t
     /* Same as before, but with some empty suboptions.  */
     ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
-    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
     test_1,
     array_length (test_1)
   }
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 7a637ef7ca..27f04e9dce 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -61,7 +61,8 @@  IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx512_unaligned_erms);
 
 	  return OPTIMIZE (avx512_unaligned);
@@ -76,7 +77,8 @@  IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (evex_unaligned_erms);
 
 	  return OPTIMIZE (evex_unaligned);
@@ -84,7 +86,8 @@  IFUNC_SELECTOR (void)
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx2_unaligned_erms_rtm);
 
 	  return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +96,16 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
 				       Prefer_No_VZEROUPPER, !))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx2_unaligned_erms);
 
 	  return OPTIMIZE (avx2_unaligned);
 	}
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
     return OPTIMIZE (sse2_unaligned_erms);
 
   return OPTIMIZE (sse2_unaligned);