x86: Add new cpu-flag `Prefer_Non_Temporal`

Message ID 20240710065226.2509525-1-goldstein.w.n@gmail.com
State New
Headers
Series x86: Add new cpu-flag `Prefer_Non_Temporal` |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Noah Goldstein July 10, 2024, 6:52 a.m. UTC
  The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
---
 sysdeps/x86/cpu-tunables.c                    |  2 ++
 sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
 ...cpu-features-preferred_feature_index_1.def |  1 +
 sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
 sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
 5 files changed, 40 insertions(+), 10 deletions(-)
  

Comments

Noah Goldstein July 10, 2024, 6:53 a.m. UTC | #1
On Wed, Jul 10, 2024 at 2:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>                                                 Fast_Unaligned_Load, 19);
>               CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>                                                 Fast_Unaligned_Copy, 19);
> +             CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +                                               Prefer_Non_Temporal, 19);
>             }
>           break;
>         case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
>
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *   if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *   do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> +   *`rep stosb` will never be used.
> +   */
> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +                          memset_non_temporal_threshold,
> +                          minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +       = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>                            minimum_non_temporal_threshold,
>                            maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -                          memset_non_temporal_threshold,
> -                          minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>                            minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx512_unaligned_erms);
>
>           return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>           && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (evex_unaligned_erms);
>
>           return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx2_unaligned_erms_rtm);
>
>           return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>                                        Prefer_No_VZEROUPPER, !))
>         {
> -         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +             || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>             return OPTIMIZE (avx2_unaligned_erms);
>
>           return OPTIMIZE (avx2_unaligned);
>         }
>      }
>
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
>
>    return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1
>
NB: This patch is intended to be applied after the 2.40 release.
  
Feifei Wang July 10, 2024, 7:34 a.m. UTC | #2
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年7月10日 14:52
> 收件人: libc-alpha@sourceware.org
> 抄送: goldstein.w.n@gmail.com; hjl.tools@gmail.com; Feifei Wang
> <wangfeifei@hygon.cn>
> 主题: x86: Add new cpu-flag `Prefer_Non_Temporal`
> 
> The goal of this flag is to allow targets which don't prefer/have ERMS to still
> access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28
> +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index
> 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t
> *valp)
>  						Fast_Unaligned_Load, 19);
>  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>  						Fast_Unaligned_Copy, 19);
> +	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +						Prefer_Non_Temporal, 19);
>  	    }
>  	  break;
>  	case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index
> 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
> 
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through
> the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *	if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *	do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh =
> non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh =
> non_temporal_thresh`,
       Maybe use `==` here: "If `rep_stosb_thresh == non_temporal_thresh`,"?
> +   *`rep stosb` will never be used.
> +   */
> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +			   memset_non_temporal_threshold,
> +			   minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold,
> non_temporal_threshold,
>  			   minimum_non_temporal_threshold,
>  			   maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -			   memset_non_temporal_threshold,
> -			   minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold,
> rep_movsb_threshold,
>  			   minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold,
> rep_stosb_threshold, 1, diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)  BIT
> (MathVec_Prefer_No_AVX512)  BIT (Prefer_FSRM)  BIT
> (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h
> b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx512_unaligned_erms);
> 
>  	  return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (evex_unaligned_erms);
> 
>  	  return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> 
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms_rtm);
> 
>  	  return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +96,16 @@
> IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>  				       Prefer_No_VZEROUPPER, !))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms);
> 
>  	  return OPTIMIZE (avx2_unaligned);
>  	}
>      }
> 
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
> 
>    return OPTIMIZE (sse2_unaligned);
> --
> 2.34.1

Reviewed-by: Feifei Wang <wangfeifei@hygon.cn>
  
Adhemerval Zanella Netto July 10, 2024, 2:03 p.m. UTC | #3
On 10/07/24 03:52, Noah Goldstein wrote:
> The goal of this flag is to allow targets which don't prefer/have ERMS
> to still access the non-temporal memset implementation.
> ---
>  sysdeps/x86/cpu-tunables.c                    |  2 ++
>  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
>  ...cpu-features-preferred_feature_index_1.def |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
>  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
>  5 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..8f6032b510 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>  						Fast_Unaligned_Load, 19);
>  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>  						Fast_Unaligned_Copy, 19);
> +	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +						Prefer_Non_Temporal, 19);
>  	    }
>  	  break;
>  	case 20:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..2cb70a0d0d 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>         slightly better than ERMS.  */
>      rep_stosb_threshold = SIZE_MAX;
>  
> +  /*
> +   * For memset, the non-temporal implementation is only accessed through the
> +   * stosb code. ie:
> +   * ```
> +   * if (size >= rep_stosb_thresh)
> +   * {
> +   *	if (size >= non_temporal_thresh)
> +   * {
> +   * do_non_temporal ();
> +   * }
> +   *	do_stosb ();
> +   * }
> +   * do_normal_vec_loop ();
> +   * ```
> +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> +   *`rep stosb` will never be used.
> +   */

Small nit, multi-line comments usually don't have the '*' for each line:

/*
 <comment>
 */

> +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> +			   memset_non_temporal_threshold,
> +			   minimum_non_temporal_threshold, SIZE_MAX);
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> +    rep_stosb_threshold
> +	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>  			   minimum_non_temporal_threshold,
>  			   maximum_non_temporal_threshold);
> -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> -			   memset_non_temporal_threshold,
> -			   minimum_non_temporal_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>  			   minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..5c923b3dcb 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Prefer_Non_Temporal)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..aec852770a 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
>      test_1,
>      array_length (test_1)
>    }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 7a637ef7ca..27f04e9dce 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx512_unaligned_erms);
>  
>  	  return OPTIMIZE (avx512_unaligned);
> @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (evex_unaligned_erms);
>  
>  	  return OPTIMIZE (evex_unaligned);
> @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
>  
>        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms_rtm);
>  
>  	  return OPTIMIZE (avx2_unaligned_rtm);
> @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
>  				       Prefer_No_VZEROUPPER, !))
>  	{
> -	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>  	    return OPTIMIZE (avx2_unaligned_erms);
>  
>  	  return OPTIMIZE (avx2_unaligned);
>  	}
>      }
>  
> -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
>      return OPTIMIZE (sse2_unaligned_erms);
>  
>    return OPTIMIZE (sse2_unaligned);

Should we do similar checks for ifunc-memmove.h as well since they also
have non-temporal code paths?
  
Noah Goldstein July 10, 2024, 2:20 p.m. UTC | #4
On Wed, Jul 10, 2024 at 10:03 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 10/07/24 03:52, Noah Goldstein wrote:
> > The goal of this flag is to allow targets which don't prefer/have ERMS
> > to still access the non-temporal memset implementation.
> > ---
> >  sysdeps/x86/cpu-tunables.c                    |  2 ++
> >  sysdeps/x86/dl-cacheinfo.h                    | 28 +++++++++++++++++--
> >  ...cpu-features-preferred_feature_index_1.def |  1 +
> >  sysdeps/x86/tst-hwcap-tunables.c              |  4 +--
> >  sysdeps/x86_64/multiarch/ifunc-memset.h       | 15 ++++++----
> >  5 files changed, 40 insertions(+), 10 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index 89da7a03da..8f6032b510 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -223,6 +223,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> >                                               Fast_Unaligned_Load, 19);
> >             CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> >                                               Fast_Unaligned_Copy, 19);
> > +           CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > +                                             Prefer_Non_Temporal, 19);
> >           }
> >         break;
> >       case 20:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..2cb70a0d0d 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1041,14 +1041,36 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >         slightly better than ERMS.  */
> >      rep_stosb_threshold = SIZE_MAX;
> >
> > +  /*
> > +   * For memset, the non-temporal implementation is only accessed through the
> > +   * stosb code. ie:
> > +   * ```
> > +   * if (size >= rep_stosb_thresh)
> > +   * {
> > +   * if (size >= non_temporal_thresh)
> > +   * {
> > +   * do_non_temporal ();
> > +   * }
> > +   * do_stosb ();
> > +   * }
> > +   * do_normal_vec_loop ();
> > +   * ```
> > +   * So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
> > +   * to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
> > +   *`rep stosb` will never be used.
> > +   */
>
> Small nit, multi-line comments usually don't have the '*' for each line:
>
> /*
>  <comment>
>  */

Ack.

>
> > +  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > +                        memset_non_temporal_threshold,
> > +                        minimum_non_temporal_threshold, SIZE_MAX);
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> > +    rep_stosb_threshold
> > +     = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> > +
> >    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> >                          minimum_non_temporal_threshold,
> >                          maximum_non_temporal_threshold);
> > -  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
> > -                        memset_non_temporal_threshold,
> > -                        minimum_non_temporal_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> >                          minimum_rep_movsb_threshold, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..5c923b3dcb 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >  BIT (MathVec_Prefer_No_AVX512)
> >  BIT (Prefer_FSRM)
> >  BIT (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Prefer_Non_Temporal)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..aec852770a 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> >      /* Disable everything.  */
> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > -    "-AVX_Fast_Unaligned_Load",
> > +    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
> >      test_1,
> >      array_length (test_1)
> >    },
> > @@ -68,7 +68,7 @@ static const struct test_t
> >      /* Same as before, but with some empty suboptions.  */
> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
> >      test_1,
> >      array_length (test_1)
> >    }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 7a637ef7ca..27f04e9dce 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -61,7 +61,8 @@ IFUNC_SELECTOR (void)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx512_unaligned_erms);
> >
> >         return OPTIMIZE (avx512_unaligned);
> > @@ -76,7 +77,8 @@ IFUNC_SELECTOR (void)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> >         && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (evex_unaligned_erms);
> >
> >         return OPTIMIZE (evex_unaligned);
> > @@ -84,7 +86,8 @@ IFUNC_SELECTOR (void)
> >
> >        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx2_unaligned_erms_rtm);
> >
> >         return OPTIMIZE (avx2_unaligned_rtm);
> > @@ -93,14 +96,16 @@ IFUNC_SELECTOR (void)
> >        if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> >                                      Prefer_No_VZEROUPPER, !))
> >       {
> > -       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +           || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >           return OPTIMIZE (avx2_unaligned_erms);
> >
> >         return OPTIMIZE (avx2_unaligned);
> >       }
> >      }
> >
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > +      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
> >      return OPTIMIZE (sse2_unaligned_erms);
> >
> >    return OPTIMIZE (sse2_unaligned);
>
> Should we do similar checks for ifunc-memmove.h as well since they also
> have non-temporal code paths?

At the moment, __memmove_vec_unaligned_erms is implemented with a path
for non-temporal stores that doesn't go through the ERMS code.

If that changes, however, we should update this as well.
  

Patch

diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a03da..8f6032b510 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -223,6 +223,8 @@  TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 						Fast_Unaligned_Load, 19);
 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
 						Fast_Unaligned_Copy, 19);
+	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+						Prefer_Non_Temporal, 19);
 	    }
 	  break;
 	case 20:
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5e77345a6e..2cb70a0d0d 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1041,14 +1041,36 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
        slightly better than ERMS.  */
     rep_stosb_threshold = SIZE_MAX;
 
+  /* For memset, the non-temporal implementation is only accessed through
+     the stosb code, i.e.:
+
+	if (size >= rep_stosb_thresh)
+	  {
+	    if (size >= non_temporal_thresh)
+	      do_non_temporal ();
+	    else
+	      do_stosb ();
+	  }
+	else
+	  do_normal_vec_loop ();
+
+     So if we prefer non-temporal, set
+     `rep_stosb_thresh = non_temporal_thresh` to enable the implementation.
+     When `rep_stosb_thresh == non_temporal_thresh`, `rep stosb` is never
+     used, because every size that reaches the stosb path also meets the
+     non-temporal threshold.  */
+  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+			   memset_non_temporal_threshold,
+			   minimum_non_temporal_threshold, SIZE_MAX);
+  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
+    rep_stosb_threshold
+	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
 			   minimum_non_temporal_threshold,
 			   maximum_non_temporal_threshold);
-  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
-			   memset_non_temporal_threshold,
-			   minimum_non_temporal_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 			   minimum_rep_movsb_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 85e7f54ec8..5c923b3dcb 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@  BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Prefer_Non_Temporal)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 8589a9fd66..aec852770a 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,7 @@  static const struct test_t
     /* Disable everything.  */
     "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
-    "-AVX_Fast_Unaligned_Load",
+    "-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal",
     test_1,
     array_length (test_1)
   },
@@ -68,7 +68,7 @@  static const struct test_t
     /* Same as before, but with some empty suboptions.  */
     ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
-    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+    "-ERMS,-AVX_Fast_Unaligned_Load,-Prefer_Non_Temporal,-,",
     test_1,
     array_length (test_1)
   }
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 7a637ef7ca..27f04e9dce 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -61,7 +61,8 @@  IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx512_unaligned_erms);
 
 	  return OPTIMIZE (avx512_unaligned);
@@ -76,7 +77,8 @@  IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (evex_unaligned_erms);
 
 	  return OPTIMIZE (evex_unaligned);
@@ -84,7 +86,8 @@  IFUNC_SELECTOR (void)
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx2_unaligned_erms_rtm);
 
 	  return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +96,16 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
 				       Prefer_No_VZEROUPPER, !))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
 	    return OPTIMIZE (avx2_unaligned_erms);
 
 	  return OPTIMIZE (avx2_unaligned);
 	}
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+      || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal))
     return OPTIMIZE (sse2_unaligned_erms);
 
   return OPTIMIZE (sse2_unaligned);