[v2,2/2] x86: Add separate non-temporal tunable for memset

Message ID 20240524173851.2483952-2-goldstein.w.n@gmail.com
State Committed
Commit 46b5e98ef6f1b9f4b53851f152ecb8209064b26c
Delegated to: Arjun Shankar
Headers
Series [v2,1/2] x86: Improve large memset perf with non-temporal stores [RHEL-29312] |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed

Commit Message

Noah Goldstein May 24, 2024, 5:38 p.m. UTC
  The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.

This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non-Intel vendors as the only
benchmarks showing its benefit have been on Intel hardware.
---
 manual/tunables.texi                             | 16 +++++++++++++++-
 sysdeps/x86/cacheinfo.h                          |  8 +++++++-
 sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
 sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
 sysdeps/x86/dl-tunables.list                     |  3 +++
 sysdeps/x86/include/cpu-features.h               |  4 +++-
 .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
 7 files changed, 49 insertions(+), 6 deletions(-)
  

Comments

Noah Goldstein May 29, 2024, 4:19 p.m. UTC | #1
On Fri, May 24, 2024 at 12:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The tuning for non-temporal stores for memset vs memcpy is not always
> the same. This includes both the exact value and whether non-temporal
> stores are profitable at all for a given arch.
>
> This patch add `x86_memset_non_temporal_threshold`. Currently we
> disable non-temporal stores for non Intel vendors as the only
> benchmarks showing its benefit have been on Intel hardware.
> ---
>  manual/tunables.texi                             | 16 +++++++++++++++-
>  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
>  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
>  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
>  sysdeps/x86/dl-tunables.list                     |  3 +++
>  sysdeps/x86/include/cpu-features.h               |  4 +++-
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
>  7 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index baaf751721..8dd02d8149 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
>  glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
>  glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
>  glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
>  glibc.cpu.x86_shstk:
>  glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
>  glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
> @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
>  @cindex shared_cache_size tunables
>  @cindex tunables, shared_cache_size
>  @cindex non_temporal_threshold tunables
> -@cindex tunables, non_temporal_threshold
> +@cindex memset_non_temporal_threshold tunables
> +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
>
>  @deftp {Tunable namespace} glibc.cpu
>  Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
> @@ -574,6 +576,18 @@ like memmove and memcpy.
>  This tunable is specific to i386 and x86-64.
>  @end deftp
>
> +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
> +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
> +the user to set threshold in bytes for non temporal store in
> +memset. Non temporal stores give a hint to the hardware to move data
> +directly to memory without displacing other data from the cache. This
> +tunable is used by some platforms to determine when to use non
> +temporal stores memset.
> +
> +This tunable is specific to i386 and x86-64.
> +@end deftp
> +
> +
>  @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
>  The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
>  set threshold in bytes to start using "rep movsb".  The value must be
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index ab73556772..83491607c7 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
>  long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
>  long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
>
> -/* Threshold to use non temporal store.  */
> +/* Threshold to use non temporal store in memmove.  */
>  long int __x86_shared_non_temporal_threshold attribute_hidden;
>
> +/* Threshold to use non temporal store in memset.  */
> +long int __x86_memset_non_temporal_threshold attribute_hidden;
> +
>  /* Threshold to use Enhanced REP MOVSB.  */
>  long int __x86_rep_movsb_threshold attribute_hidden = 2048;
>
> @@ -77,6 +80,9 @@ init_cacheinfo (void)
>    __x86_shared_non_temporal_threshold
>      = cpu_features->non_temporal_threshold;
>
> +  __x86_memset_non_temporal_threshold
> +      = cpu_features->memset_non_temporal_threshold;
> +
>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5a98f70364..d375a7cba6 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> +     Until we benchmark data on other x86 processor, disable non-temporal
> +     stores in memset. */
> +  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> +  if (cpu_features->basic.kind == arch_kind_intel)
> +      memset_non_temporal_threshold = non_temporal_threshold;
> +
>     /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>        cases slower than the vectorized path (and for some alignments,
>        it is really slow, check BZ #30994).  */
> @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        && tunable_size <= maximum_non_temporal_threshold)
>      non_temporal_threshold = tunable_size;
>
> +  tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +  if (tunable_size > minimum_non_temporal_threshold
> +      && tunable_size <= maximum_non_temporal_threshold)
> +    memset_non_temporal_threshold = tunable_size;
> +
>    tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
>    if (tunable_size > minimum_rep_movsb_threshold)
>      rep_movsb_threshold = tunable_size;
> @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>                            minimum_non_temporal_threshold,
>                            maximum_non_temporal_threshold);
> +  TUNABLE_SET_WITH_BOUNDS (
> +      x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
> +      minimum_non_temporal_threshold, maximum_non_temporal_threshold);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>                            minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    cpu_features->data_cache_size = data;
>    cpu_features->shared_cache_size = shared;
>    cpu_features->non_temporal_threshold = non_temporal_threshold;
> +  cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
>    cpu_features->rep_movsb_threshold = rep_movsb_threshold;
>    cpu_features->rep_stosb_threshold = rep_stosb_threshold;
>    cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> index ceafde9481..49eeb5f70a 100644
> --- a/sysdeps/x86/dl-diagnostics-cpu.c
> +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
>                              cpu_features->shared_cache_size);
>    print_cpu_features_value ("non_temporal_threshold",
>                              cpu_features->non_temporal_threshold);
> +  print_cpu_features_value ("memset_non_temporal_threshold",
> +                            cpu_features->memset_non_temporal_threshold);
>    print_cpu_features_value ("rep_movsb_threshold",
>                              cpu_features->rep_movsb_threshold);
>    print_cpu_features_value ("rep_movsb_stop_threshold",
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index 7d82da0dec..a0a1299592 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -30,6 +30,9 @@ glibc {
>      x86_non_temporal_threshold {
>        type: SIZE_T
>      }
> +    x86_memset_non_temporal_threshold {
> +      type: SIZE_T
> +    }
>      x86_rep_movsb_threshold {
>        type: SIZE_T
>        # Since there is overhead to set up REP MOVSB operation, REP
> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> index cd7bd27cf3..aaae44f0e1 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -944,8 +944,10 @@ struct cpu_features
>    /* Shared cache size for use in memory and string routines, typically
>       L2 or L3 size.  */
>    unsigned long int shared_cache_size;
> -  /* Threshold to use non temporal store.  */
> +  /* Threshold to use non temporal store in memmove.  */
>    unsigned long int non_temporal_threshold;
> +  /* Threshold to use non temporal store in memset.  */
> +  unsigned long int memset_non_temporal_threshold;
>    /* Threshold to use "rep movsb".  */
>    unsigned long int rep_movsb_threshold;
>    /* Threshold to stop using "rep movsb".  */
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 637caadb40..88bf08e4f4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -24,9 +24,9 @@
>     5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
>        4 VEC stores and store 4 * VEC at a time until done.
>     6. On machines ERMS feature, if size is range
> -         [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
> +         [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
>           then REP STOSB will be used.
> -   7. If size >= __x86_shared_non_temporal_threshold, use a
> +   7. If size >= __x86_memset_non_temporal_threshold, use a
>           non-temporal stores.  */
>
>  #include <sysdep.h>
> @@ -318,7 +318,7 @@ L(return_vzeroupper):
>         /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
>            range for 2-byte jump encoding.  */
>  L(stosb_local):
> -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_memset_non_temporal_threshold(%rip), %RDX_LP
>         jae     L(nt_memset)
>         movzbl  %sil, %eax
>         mov     %RDX_LP, %RCX_LP
> --
> 2.34.1
>
ping
  
H.J. Lu May 29, 2024, 10:53 p.m. UTC | #2
On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The tuning for non-temporal stores for memset vs memcpy is not always
> the same. This includes both the exact value and whether non-temporal
> stores are profitable at all for a given arch.
>
> This patch add `x86_memset_non_temporal_threshold`. Currently we
> disable non-temporal stores for non Intel vendors as the only
> benchmarks showing its benefit have been on Intel hardware.
> ---
>  manual/tunables.texi                             | 16 +++++++++++++++-
>  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
>  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
>  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
>  sysdeps/x86/dl-tunables.list                     |  3 +++
>  sysdeps/x86/include/cpu-features.h               |  4 +++-
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
>  7 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index baaf751721..8dd02d8149 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
>  glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
>  glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
>  glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
>  glibc.cpu.x86_shstk:
>  glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
>  glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
> @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
>  @cindex shared_cache_size tunables
>  @cindex tunables, shared_cache_size
>  @cindex non_temporal_threshold tunables
> -@cindex tunables, non_temporal_threshold
> +@cindex memset_non_temporal_threshold tunables
> +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
>
>  @deftp {Tunable namespace} glibc.cpu
>  Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
> @@ -574,6 +576,18 @@ like memmove and memcpy.
>  This tunable is specific to i386 and x86-64.
>  @end deftp
>
> +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
> +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
> +the user to set threshold in bytes for non temporal store in
> +memset. Non temporal stores give a hint to the hardware to move data
> +directly to memory without displacing other data from the cache. This
> +tunable is used by some platforms to determine when to use non
> +temporal stores memset.
> +
> +This tunable is specific to i386 and x86-64.
> +@end deftp
> +
> +
>  @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
>  The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
>  set threshold in bytes to start using "rep movsb".  The value must be
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index ab73556772..83491607c7 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
>  long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
>  long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
>
> -/* Threshold to use non temporal store.  */
> +/* Threshold to use non temporal store in memmove.  */
>  long int __x86_shared_non_temporal_threshold attribute_hidden;
>
> +/* Threshold to use non temporal store in memset.  */
> +long int __x86_memset_non_temporal_threshold attribute_hidden;
> +
>  /* Threshold to use Enhanced REP MOVSB.  */
>  long int __x86_rep_movsb_threshold attribute_hidden = 2048;
>
> @@ -77,6 +80,9 @@ init_cacheinfo (void)
>    __x86_shared_non_temporal_threshold
>      = cpu_features->non_temporal_threshold;
>
> +  __x86_memset_non_temporal_threshold
> +      = cpu_features->memset_non_temporal_threshold;
> +
>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5a98f70364..d375a7cba6 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> +     Until we benchmark data on other x86 processor, disable non-temporal
> +     stores in memset. */
> +  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> +  if (cpu_features->basic.kind == arch_kind_intel)
> +      memset_non_temporal_threshold = non_temporal_threshold;
> +
>     /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>        cases slower than the vectorized path (and for some alignments,
>        it is really slow, check BZ #30994).  */
> @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        && tunable_size <= maximum_non_temporal_threshold)
>      non_temporal_threshold = tunable_size;
>
> +  tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> +  if (tunable_size > minimum_non_temporal_threshold
> +      && tunable_size <= maximum_non_temporal_threshold)
> +    memset_non_temporal_threshold = tunable_size;
> +
>    tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
>    if (tunable_size > minimum_rep_movsb_threshold)
>      rep_movsb_threshold = tunable_size;
> @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
>                            minimum_non_temporal_threshold,
>                            maximum_non_temporal_threshold);
> +  TUNABLE_SET_WITH_BOUNDS (
> +      x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
> +      minimum_non_temporal_threshold, maximum_non_temporal_threshold);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
>                            minimum_rep_movsb_threshold, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    cpu_features->data_cache_size = data;
>    cpu_features->shared_cache_size = shared;
>    cpu_features->non_temporal_threshold = non_temporal_threshold;
> +  cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
>    cpu_features->rep_movsb_threshold = rep_movsb_threshold;
>    cpu_features->rep_stosb_threshold = rep_stosb_threshold;
>    cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> index ceafde9481..49eeb5f70a 100644
> --- a/sysdeps/x86/dl-diagnostics-cpu.c
> +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
>                              cpu_features->shared_cache_size);
>    print_cpu_features_value ("non_temporal_threshold",
>                              cpu_features->non_temporal_threshold);
> +  print_cpu_features_value ("memset_non_temporal_threshold",
> +                            cpu_features->memset_non_temporal_threshold);
>    print_cpu_features_value ("rep_movsb_threshold",
>                              cpu_features->rep_movsb_threshold);
>    print_cpu_features_value ("rep_movsb_stop_threshold",
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index 7d82da0dec..a0a1299592 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -30,6 +30,9 @@ glibc {
>      x86_non_temporal_threshold {
>        type: SIZE_T
>      }
> +    x86_memset_non_temporal_threshold {
> +      type: SIZE_T
> +    }
>      x86_rep_movsb_threshold {
>        type: SIZE_T
>        # Since there is overhead to set up REP MOVSB operation, REP
> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> index cd7bd27cf3..aaae44f0e1 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -944,8 +944,10 @@ struct cpu_features
>    /* Shared cache size for use in memory and string routines, typically
>       L2 or L3 size.  */
>    unsigned long int shared_cache_size;
> -  /* Threshold to use non temporal store.  */
> +  /* Threshold to use non temporal store in memmove.  */
>    unsigned long int non_temporal_threshold;
> +  /* Threshold to use non temporal store in memset.  */
> +  unsigned long int memset_non_temporal_threshold;
>    /* Threshold to use "rep movsb".  */
>    unsigned long int rep_movsb_threshold;
>    /* Threshold to stop using "rep movsb".  */
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 637caadb40..88bf08e4f4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -24,9 +24,9 @@
>     5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
>        4 VEC stores and store 4 * VEC at a time until done.
>     6. On machines ERMS feature, if size is range
> -         [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
> +         [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
>           then REP STOSB will be used.
> -   7. If size >= __x86_shared_non_temporal_threshold, use a
> +   7. If size >= __x86_memset_non_temporal_threshold, use a
>           non-temporal stores.  */
>
>  #include <sysdep.h>
> @@ -318,7 +318,7 @@ L(return_vzeroupper):
>         /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
>            range for 2-byte jump encoding.  */
>  L(stosb_local):
> -       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_memset_non_temporal_threshold(%rip), %RDX_LP
>         jae     L(nt_memset)
>         movzbl  %sil, %eax
>         mov     %RDX_LP, %RCX_LP
> --
> 2.34.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Borislav Petkov June 14, 2024, 10:40 a.m. UTC | #3
Hi,

I'm not subscribed to the glibc list - pls CC me directly on replies.

On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The tuning for non-temporal stores for memset vs memcpy is not always
> > the same. This includes both the exact value and whether non-temporal
> > stores are profitable at all for a given arch.
> >
> > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > disable non-temporal stores for non Intel vendors as the only
> > benchmarks showing its benefit have been on Intel hardware.
> > ---
> >  manual/tunables.texi                             | 16 +++++++++++++++-
> >  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
> >  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
> >  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
> >  sysdeps/x86/dl-tunables.list                     |  3 +++
> >  sysdeps/x86/include/cpu-features.h               |  4 +++-
> >  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
> >  7 files changed, 49 insertions(+), 6 deletions(-)

...

> > +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> > +     Until we benchmark data on other x86 processor, disable non-temporal
> > +     stores in memset. */

Well, something's fishy here:

$ ./elf/ld.so --list-tunables | grep threshold
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
					    ^^^^^^^^^

on glibc-2.39.9000-300-g54c1efdac55b from git.

That's on an AMD Zen1, so I'd expect the memset NT threshold to be
0xffffffffffffffff by default...

Thx.
  
Noah Goldstein June 14, 2024, 4:39 p.m. UTC | #4
On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
>
> Hi,
>
> I'm not subscribed to the glibc list - pls CC me directly on replies.
>
> On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > the same. This includes both the exact value and whether non-temporal
> > > stores are profitable at all for a given arch.
> > >
> > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > disable non-temporal stores for non Intel vendors as the only
> > > benchmarks showing its benefit have been on Intel hardware.
> > > ---
> > >  manual/tunables.texi                             | 16 +++++++++++++++-
> > >  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
> > >  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
> > >  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
> > >  sysdeps/x86/dl-tunables.list                     |  3 +++
> > >  sysdeps/x86/include/cpu-features.h               |  4 +++-
> > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
> > >  7 files changed, 49 insertions(+), 6 deletions(-)
>
> ...
>
> > > +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > +     Until we benchmark data on other x86 processor, disable non-temporal
> > > +     stores in memset. */
>
> Well, something's fishy here:
>
> $ ./elf/ld.so --list-tunables | grep threshold
> glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
>                                             ^^^^^^^^^
>
> on glibc-2.39.9000-300-g54c1efdac55b from git.
>
> That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> 0xffffffffffffffff by default...
>
> Thx.
>

Thanks for bringing this up, looking into it.
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
  
Borislav Petkov June 14, 2024, 6:01 p.m. UTC | #5
On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote:
> On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
> >
> > Hi,
> >
> > I'm not subscribed to the glibc list - pls CC me directly on replies.
> >
> > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > > the same. This includes both the exact value and whether non-temporal
> > > > stores are profitable at all for a given arch.
> > > >
> > > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > > disable non-temporal stores for non Intel vendors as the only
> > > > benchmarks showing its benefit have been on Intel hardware.
> > > > ---
> > > >  manual/tunables.texi                             | 16 +++++++++++++++-
> > > >  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
> > > >  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
> > > >  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
> > > >  sysdeps/x86/dl-tunables.list                     |  3 +++
> > > >  sysdeps/x86/include/cpu-features.h               |  4 +++-
> > > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
> > > >  7 files changed, 49 insertions(+), 6 deletions(-)
> >
> > ...
> >
> > > > +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > > +     Until we benchmark data on other x86 processor, disable non-temporal
> > > > +     stores in memset. */
> >
> > Well, something's fishy here:
> >
> > $ ./elf/ld.so --list-tunables | grep threshold
> > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> >                                             ^^^^^^^^^
> >
> > on glibc-2.39.9000-300-g54c1efdac55b from git.
> >
> > That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> > 0xffffffffffffffff by default...
> >
> > Thx.
> >
> 
> Thanks for bringing this up, looking into it.

Thx, so Michael did debug it yesterday to the ranges mismatching:

diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
index 147cc4cf23f5..ecf3c1d3736e 100644
--- a/elf/dl-tunables.c
+++ b/elf/dl-tunables.c
@@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp,
 
   /* Bail out if the bounds are not valid.  */
   if (tunable_val_lt (val, min, unsigned_cmp)
-      || tunable_val_lt (max, val, unsigned_cmp))
+      || tunable_val_lt (max, val, unsigned_cmp)) {
+         _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n",
+                    val, min, max);
     return;
+  }
 
   cur->val.numval = val;
   cur->type.min = min;

$ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)"
dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff
dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff
bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
^^^^^^^

dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)

but you guys probably should do the right fix here.

Thx.
  
Noah Goldstein June 14, 2024, 6:02 p.m. UTC | #6
On Fri, Jun 14, 2024 at 1:01 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote:
> > On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > Hi,
> > >
> > > I'm not subscribed to the glibc list - pls CC me directly on replies.
> > >
> > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > > > the same. This includes both the exact value and whether non-temporal
> > > > > stores are profitable at all for a given arch.
> > > > >
> > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > > > disable non-temporal stores for non Intel vendors as the only
> > > > > benchmarks showing its benefit have been on Intel hardware.
> > > > > ---
> > > > >  manual/tunables.texi                             | 16 +++++++++++++++-
> > > > >  sysdeps/x86/cacheinfo.h                          |  8 +++++++-
> > > > >  sysdeps/x86/dl-cacheinfo.h                       | 16 ++++++++++++++++
> > > > >  sysdeps/x86/dl-diagnostics-cpu.c                 |  2 ++
> > > > >  sysdeps/x86/dl-tunables.list                     |  3 +++
> > > > >  sysdeps/x86/include/cpu-features.h               |  4 +++-
> > > > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S |  6 +++---
> > > > >  7 files changed, 49 insertions(+), 6 deletions(-)
> > >
> > > ...
> > >
> > > > > +  /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > > > +     Until we benchmark data on other x86 processor, disable non-temporal
> > > > > +     stores in memset. */
> > >
> > > Well, something's fishy here:
> > >
> > > $ ./elf/ld.so --list-tunables | grep threshold
> > > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> > > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > >                                             ^^^^^^^^^
> > >
> > > on glibc-2.39.9000-300-g54c1efdac55b from git.
> > >
> > > That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> > > 0xffffffffffffffff by default...
> > >
> > > Thx.
> > >
> >
> > Thanks for bringing this up, looking into it.
>
> Thx, so Michael did debug it yesterday to the ranges mismatching:
>
> diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
> index 147cc4cf23f5..ecf3c1d3736e 100644
> --- a/elf/dl-tunables.c
> +++ b/elf/dl-tunables.c
> @@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp,
>
>    /* Bail out if the bounds are not valid.  */
>    if (tunable_val_lt (val, min, unsigned_cmp)
> -      || tunable_val_lt (max, val, unsigned_cmp))
> +      || tunable_val_lt (max, val, unsigned_cmp)) {
> +         _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n",
> +                    val, min, max);
>      return;
> +  }
>
>    cur->val.numval = val;
>    cur->type.min = min;
>
> $ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)"
> dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff
> dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff
> bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> ^^^^^^^
>
> dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
>
> but you guys probably should do the right fix here.

Just posted the fix, you should be CCd on it.
>
> Thx.
>
> --
> Regards/Gruss,
>     Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
  

Patch

diff --git a/manual/tunables.texi b/manual/tunables.texi
index baaf751721..8dd02d8149 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@  glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
 glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
 glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
 glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
 glibc.cpu.x86_shstk:
 glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
 glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -495,7 +496,8 @@  thread stack originally backup by Huge Pages to default pages.
 @cindex shared_cache_size tunables
 @cindex tunables, shared_cache_size
 @cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
 
 @deftp {Tunable namespace} glibc.cpu
 Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -574,6 +576,18 @@  like memmove and memcpy.
 This tunable is specific to i386 and x86-64.
 @end deftp
 
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set the threshold in bytes for non temporal stores in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores in memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
 @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
 The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
 set threshold in bytes to start using "rep movsb".  The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ab73556772..83491607c7 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@  long int __x86_data_cache_size attribute_hidden = 32 * 1024;
 long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
 long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
 
-/* Threshold to use non temporal store.  */
+/* Threshold to use non temporal store in memmove.  */
 long int __x86_shared_non_temporal_threshold attribute_hidden;
 
+/* Threshold to use non temporal store in memset.  */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
 /* Threshold to use Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_threshold attribute_hidden = 2048;
 
@@ -77,6 +80,9 @@  init_cacheinfo (void)
   __x86_shared_non_temporal_threshold
     = cpu_features->non_temporal_threshold;
 
+  __x86_memset_non_temporal_threshold
+      = cpu_features->memset_non_temporal_threshold;
+
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
   __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5a98f70364..d375a7cba6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -986,6 +986,13 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
+  /* Non-temporal stores in memset have only been tested on Intel hardware.
+     Until we have benchmark data on other x86 processors, disable non-temporal
+     stores in memset. */
+  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+  if (cpu_features->basic.kind == arch_kind_intel)
+      memset_non_temporal_threshold = non_temporal_threshold;
+
    /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
       cases slower than the vectorized path (and for some alignments,
       it is really slow, check BZ #30994).  */
@@ -1012,6 +1019,11 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
       && tunable_size <= maximum_non_temporal_threshold)
     non_temporal_threshold = tunable_size;
 
+  tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+  if (tunable_size > minimum_non_temporal_threshold
+      && tunable_size <= maximum_non_temporal_threshold)
+    memset_non_temporal_threshold = tunable_size;
+
   tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
   if (tunable_size > minimum_rep_movsb_threshold)
     rep_movsb_threshold = tunable_size;
@@ -1032,6 +1044,9 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
 			   minimum_non_temporal_threshold,
 			   maximum_non_temporal_threshold);
+  TUNABLE_SET_WITH_BOUNDS (
+      x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+      minimum_non_temporal_threshold, maximum_non_temporal_threshold);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 			   minimum_rep_movsb_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1045,6 +1060,7 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
   cpu_features->non_temporal_threshold = non_temporal_threshold;
+  cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
   cpu_features->rep_movsb_threshold = rep_movsb_threshold;
   cpu_features->rep_stosb_threshold = rep_stosb_threshold;
   cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index ceafde9481..49eeb5f70a 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -94,6 +94,8 @@  _dl_diagnostics_cpu (void)
                             cpu_features->shared_cache_size);
   print_cpu_features_value ("non_temporal_threshold",
                             cpu_features->non_temporal_threshold);
+  print_cpu_features_value ("memset_non_temporal_threshold",
+                            cpu_features->memset_non_temporal_threshold);
   print_cpu_features_value ("rep_movsb_threshold",
                             cpu_features->rep_movsb_threshold);
   print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index 7d82da0dec..a0a1299592 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@  glibc {
     x86_non_temporal_threshold {
       type: SIZE_T
     }
+    x86_memset_non_temporal_threshold {
+      type: SIZE_T
+    }
     x86_rep_movsb_threshold {
       type: SIZE_T
       # Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index cd7bd27cf3..aaae44f0e1 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -944,8 +944,10 @@  struct cpu_features
   /* Shared cache size for use in memory and string routines, typically
      L2 or L3 size.  */
   unsigned long int shared_cache_size;
-  /* Threshold to use non temporal store.  */
+  /* Threshold to use non temporal store in memmove.  */
   unsigned long int non_temporal_threshold;
+  /* Threshold to use non temporal store in memset.  */
+  unsigned long int memset_non_temporal_threshold;
   /* Threshold to use "rep movsb".  */
   unsigned long int rep_movsb_threshold;
   /* Threshold to stop using "rep movsb".  */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 637caadb40..88bf08e4f4 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@ 
    5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
       4 VEC stores and store 4 * VEC at a time until done.
    6. On machines ERMS feature, if size is range
-	  [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+	  [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
 	  then REP STOSB will be used.
-   7. If size >= __x86_shared_non_temporal_threshold, use a
+   7. If size >= __x86_memset_non_temporal_threshold, use a
 	  non-temporal stores.  */
 
 #include <sysdep.h>
@@ -318,7 +318,7 @@  L(return_vzeroupper):
 	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
 	   range for 2-byte jump encoding.  */
 L(stosb_local):
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_memset_non_temporal_threshold(%rip), %RDX_LP
 	jae	L(nt_memset)
 	movzbl	%sil, %eax
 	mov	%RDX_LP, %RCX_LP