[v2,2/2] x86: Add separate non-temporal tunable for memset
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Testing passed
|
Commit Message
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non Intel vendors as the only
benchmarks showing its benefit have been on Intel hardware.
---
manual/tunables.texi | 16 +++++++++++++++-
sysdeps/x86/cacheinfo.h | 8 +++++++-
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
sysdeps/x86/dl-tunables.list | 3 +++
sysdeps/x86/include/cpu-features.h | 4 +++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
7 files changed, 49 insertions(+), 6 deletions(-)
Comments
On Fri, May 24, 2024 at 12:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The tuning for non-temporal stores for memset vs memcpy is not always
> the same. This includes both the exact value and whether non-temporal
> stores are profitable at all for a given arch.
>
> This patch add `x86_memset_non_temporal_threshold`. Currently we
> disable non-temporal stores for non Intel vendors as the only
> benchmarks showing its benefit have been on Intel hardware.
> ---
> manual/tunables.texi | 16 +++++++++++++++-
> sysdeps/x86/cacheinfo.h | 8 +++++++-
> sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> sysdeps/x86/dl-tunables.list | 3 +++
> sysdeps/x86/include/cpu-features.h | 4 +++-
> .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> 7 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index baaf751721..8dd02d8149 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
> glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
> glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> glibc.cpu.x86_shstk:
> glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
> @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
> @cindex shared_cache_size tunables
> @cindex tunables, shared_cache_size
> @cindex non_temporal_threshold tunables
> -@cindex tunables, non_temporal_threshold
> +@cindex memset_non_temporal_threshold tunables
> +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
>
> @deftp {Tunable namespace} glibc.cpu
> Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
> @@ -574,6 +576,18 @@ like memmove and memcpy.
> This tunable is specific to i386 and x86-64.
> @end deftp
>
> +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
> +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
> +the user to set threshold in bytes for non temporal store in
> +memset. Non temporal stores give a hint to the hardware to move data
> +directly to memory without displacing other data from the cache. This
> +tunable is used by some platforms to determine when to use non
> +temporal stores memset.
> +
> +This tunable is specific to i386 and x86-64.
> +@end deftp
> +
> +
> @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
> The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
> set threshold in bytes to start using "rep movsb". The value must be
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index ab73556772..83491607c7 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
> long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
> long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
>
> -/* Threshold to use non temporal store. */
> +/* Threshold to use non temporal store in memmove. */
> long int __x86_shared_non_temporal_threshold attribute_hidden;
>
> +/* Threshold to use non temporal store in memset. */
> +long int __x86_memset_non_temporal_threshold attribute_hidden;
> +
> /* Threshold to use Enhanced REP MOVSB. */
> long int __x86_rep_movsb_threshold attribute_hidden = 2048;
>
> @@ -77,6 +80,9 @@ init_cacheinfo (void)
> __x86_shared_non_temporal_threshold
> = cpu_features->non_temporal_threshold;
>
> + __x86_memset_non_temporal_threshold
> + = cpu_features->memset_non_temporal_threshold;
> +
> __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5a98f70364..d375a7cba6 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> rep_movsb_threshold = 2112;
>
> + /* Non-temporal stores in memset have only been tested on Intel hardware.
> + Until we benchmark data on other x86 processor, disable non-temporal
> + stores in memset. */
> + unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> + if (cpu_features->basic.kind == arch_kind_intel)
> + memset_non_temporal_threshold = non_temporal_threshold;
> +
> /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> cases slower than the vectorized path (and for some alignments,
> it is really slow, check BZ #30994). */
> @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> && tunable_size <= maximum_non_temporal_threshold)
> non_temporal_threshold = tunable_size;
>
> + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> + if (tunable_size > minimum_non_temporal_threshold
> + && tunable_size <= maximum_non_temporal_threshold)
> + memset_non_temporal_threshold = tunable_size;
> +
> tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
> if (tunable_size > minimum_rep_movsb_threshold)
> rep_movsb_threshold = tunable_size;
> @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> minimum_non_temporal_threshold,
> maximum_non_temporal_threshold);
> + TUNABLE_SET_WITH_BOUNDS (
> + x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
> + minimum_non_temporal_threshold, maximum_non_temporal_threshold);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> minimum_rep_movsb_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> cpu_features->data_cache_size = data;
> cpu_features->shared_cache_size = shared;
> cpu_features->non_temporal_threshold = non_temporal_threshold;
> + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
> cpu_features->rep_movsb_threshold = rep_movsb_threshold;
> cpu_features->rep_stosb_threshold = rep_stosb_threshold;
> cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> index ceafde9481..49eeb5f70a 100644
> --- a/sysdeps/x86/dl-diagnostics-cpu.c
> +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
> cpu_features->shared_cache_size);
> print_cpu_features_value ("non_temporal_threshold",
> cpu_features->non_temporal_threshold);
> + print_cpu_features_value ("memset_non_temporal_threshold",
> + cpu_features->memset_non_temporal_threshold);
> print_cpu_features_value ("rep_movsb_threshold",
> cpu_features->rep_movsb_threshold);
> print_cpu_features_value ("rep_movsb_stop_threshold",
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index 7d82da0dec..a0a1299592 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -30,6 +30,9 @@ glibc {
> x86_non_temporal_threshold {
> type: SIZE_T
> }
> + x86_memset_non_temporal_threshold {
> + type: SIZE_T
> + }
> x86_rep_movsb_threshold {
> type: SIZE_T
> # Since there is overhead to set up REP MOVSB operation, REP
> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> index cd7bd27cf3..aaae44f0e1 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -944,8 +944,10 @@ struct cpu_features
> /* Shared cache size for use in memory and string routines, typically
> L2 or L3 size. */
> unsigned long int shared_cache_size;
> - /* Threshold to use non temporal store. */
> + /* Threshold to use non temporal store in memmove. */
> unsigned long int non_temporal_threshold;
> + /* Threshold to use non temporal store in memset. */
> + unsigned long int memset_non_temporal_threshold;
> /* Threshold to use "rep movsb". */
> unsigned long int rep_movsb_threshold;
> /* Threshold to stop using "rep movsb". */
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 637caadb40..88bf08e4f4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -24,9 +24,9 @@
> 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
> 4 VEC stores and store 4 * VEC at a time until done.
> 6. On machines ERMS feature, if size is range
> - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
> + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
> then REP STOSB will be used.
> - 7. If size >= __x86_shared_non_temporal_threshold, use a
> + 7. If size >= __x86_memset_non_temporal_threshold, use a
> non-temporal stores. */
>
> #include <sysdep.h>
> @@ -318,7 +318,7 @@ L(return_vzeroupper):
> /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
> range for 2-byte jump encoding. */
> L(stosb_local):
> - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
> jae L(nt_memset)
> movzbl %sil, %eax
> mov %RDX_LP, %RCX_LP
> --
> 2.34.1
>
ping
On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The tuning for non-temporal stores for memset vs memcpy is not always
> the same. This includes both the exact value and whether non-temporal
> stores are profitable at all for a given arch.
>
> This patch add `x86_memset_non_temporal_threshold`. Currently we
> disable non-temporal stores for non Intel vendors as the only
> benchmarks showing its benefit have been on Intel hardware.
> ---
> manual/tunables.texi | 16 +++++++++++++++-
> sysdeps/x86/cacheinfo.h | 8 +++++++-
> sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> sysdeps/x86/dl-tunables.list | 3 +++
> sysdeps/x86/include/cpu-features.h | 4 +++-
> .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> 7 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index baaf751721..8dd02d8149 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
> glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
> glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
> glibc.cpu.x86_shstk:
> glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
> @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
> @cindex shared_cache_size tunables
> @cindex tunables, shared_cache_size
> @cindex non_temporal_threshold tunables
> -@cindex tunables, non_temporal_threshold
> +@cindex memset_non_temporal_threshold tunables
> +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
>
> @deftp {Tunable namespace} glibc.cpu
> Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
> @@ -574,6 +576,18 @@ like memmove and memcpy.
> This tunable is specific to i386 and x86-64.
> @end deftp
>
> +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
> +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
> +the user to set threshold in bytes for non temporal store in
> +memset. Non temporal stores give a hint to the hardware to move data
> +directly to memory without displacing other data from the cache. This
> +tunable is used by some platforms to determine when to use non
> +temporal stores memset.
> +
> +This tunable is specific to i386 and x86-64.
> +@end deftp
> +
> +
> @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
> The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
> set threshold in bytes to start using "rep movsb". The value must be
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index ab73556772..83491607c7 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
> long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
> long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
>
> -/* Threshold to use non temporal store. */
> +/* Threshold to use non temporal store in memmove. */
> long int __x86_shared_non_temporal_threshold attribute_hidden;
>
> +/* Threshold to use non temporal store in memset. */
> +long int __x86_memset_non_temporal_threshold attribute_hidden;
> +
> /* Threshold to use Enhanced REP MOVSB. */
> long int __x86_rep_movsb_threshold attribute_hidden = 2048;
>
> @@ -77,6 +80,9 @@ init_cacheinfo (void)
> __x86_shared_non_temporal_threshold
> = cpu_features->non_temporal_threshold;
>
> + __x86_memset_non_temporal_threshold
> + = cpu_features->memset_non_temporal_threshold;
> +
> __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5a98f70364..d375a7cba6 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> rep_movsb_threshold = 2112;
>
> + /* Non-temporal stores in memset have only been tested on Intel hardware.
> + Until we benchmark data on other x86 processor, disable non-temporal
> + stores in memset. */
> + unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> + if (cpu_features->basic.kind == arch_kind_intel)
> + memset_non_temporal_threshold = non_temporal_threshold;
> +
> /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> cases slower than the vectorized path (and for some alignments,
> it is really slow, check BZ #30994). */
> @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> && tunable_size <= maximum_non_temporal_threshold)
> non_temporal_threshold = tunable_size;
>
> + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
> + if (tunable_size > minimum_non_temporal_threshold
> + && tunable_size <= maximum_non_temporal_threshold)
> + memset_non_temporal_threshold = tunable_size;
> +
> tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
> if (tunable_size > minimum_rep_movsb_threshold)
> rep_movsb_threshold = tunable_size;
> @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
> minimum_non_temporal_threshold,
> maximum_non_temporal_threshold);
> + TUNABLE_SET_WITH_BOUNDS (
> + x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
> + minimum_non_temporal_threshold, maximum_non_temporal_threshold);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
> minimum_rep_movsb_threshold, SIZE_MAX);
> TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> cpu_features->data_cache_size = data;
> cpu_features->shared_cache_size = shared;
> cpu_features->non_temporal_threshold = non_temporal_threshold;
> + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
> cpu_features->rep_movsb_threshold = rep_movsb_threshold;
> cpu_features->rep_stosb_threshold = rep_stosb_threshold;
> cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
> diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> index ceafde9481..49eeb5f70a 100644
> --- a/sysdeps/x86/dl-diagnostics-cpu.c
> +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
> cpu_features->shared_cache_size);
> print_cpu_features_value ("non_temporal_threshold",
> cpu_features->non_temporal_threshold);
> + print_cpu_features_value ("memset_non_temporal_threshold",
> + cpu_features->memset_non_temporal_threshold);
> print_cpu_features_value ("rep_movsb_threshold",
> cpu_features->rep_movsb_threshold);
> print_cpu_features_value ("rep_movsb_stop_threshold",
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index 7d82da0dec..a0a1299592 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -30,6 +30,9 @@ glibc {
> x86_non_temporal_threshold {
> type: SIZE_T
> }
> + x86_memset_non_temporal_threshold {
> + type: SIZE_T
> + }
> x86_rep_movsb_threshold {
> type: SIZE_T
> # Since there is overhead to set up REP MOVSB operation, REP
> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> index cd7bd27cf3..aaae44f0e1 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -944,8 +944,10 @@ struct cpu_features
> /* Shared cache size for use in memory and string routines, typically
> L2 or L3 size. */
> unsigned long int shared_cache_size;
> - /* Threshold to use non temporal store. */
> + /* Threshold to use non temporal store in memmove. */
> unsigned long int non_temporal_threshold;
> + /* Threshold to use non temporal store in memset. */
> + unsigned long int memset_non_temporal_threshold;
> /* Threshold to use "rep movsb". */
> unsigned long int rep_movsb_threshold;
> /* Threshold to stop using "rep movsb". */
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 637caadb40..88bf08e4f4 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -24,9 +24,9 @@
> 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
> 4 VEC stores and store 4 * VEC at a time until done.
> 6. On machines ERMS feature, if size is range
> - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
> + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
> then REP STOSB will be used.
> - 7. If size >= __x86_shared_non_temporal_threshold, use a
> + 7. If size >= __x86_memset_non_temporal_threshold, use a
> non-temporal stores. */
>
> #include <sysdep.h>
> @@ -318,7 +318,7 @@ L(return_vzeroupper):
> /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
> range for 2-byte jump encoding. */
> L(stosb_local):
> - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
> jae L(nt_memset)
> movzbl %sil, %eax
> mov %RDX_LP, %RCX_LP
> --
> 2.34.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
Hi,
I'm not subscribed to the glibc list - pls CC me directly on replies.
On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The tuning for non-temporal stores for memset vs memcpy is not always
> > the same. This includes both the exact value and whether non-temporal
> > stores are profitable at all for a given arch.
> >
> > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > disable non-temporal stores for non Intel vendors as the only
> > benchmarks showing its benefit have been on Intel hardware.
> > ---
> > manual/tunables.texi | 16 +++++++++++++++-
> > sysdeps/x86/cacheinfo.h | 8 +++++++-
> > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> > sysdeps/x86/dl-tunables.list | 3 +++
> > sysdeps/x86/include/cpu-features.h | 4 +++-
> > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> > 7 files changed, 49 insertions(+), 6 deletions(-)
...
> > + /* Non-temporal stores in memset have only been tested on Intel hardware.
> > + Until we benchmark data on other x86 processor, disable non-temporal
> > + stores in memset. */
Well, something's fishy here:
$ ./elf/ld.so --list-tunables | grep threshold
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
^^^^^^^^^
on glibc-2.39.9000-300-g54c1efdac55b from git.
That's on a AMD Zen1 so I'd expect that memset NT threshold to be
0xffffffffffffffff by default...
Thx.
On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
>
> Hi,
>
> I'm not subscribed to the glibc list - pls CC me directly on replies.
>
> On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > the same. This includes both the exact value and whether non-temporal
> > > stores are profitable at all for a given arch.
> > >
> > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > disable non-temporal stores for non Intel vendors as the only
> > > benchmarks showing its benefit have been on Intel hardware.
> > > ---
> > > manual/tunables.texi | 16 +++++++++++++++-
> > > sysdeps/x86/cacheinfo.h | 8 +++++++-
> > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> > > sysdeps/x86/dl-tunables.list | 3 +++
> > > sysdeps/x86/include/cpu-features.h | 4 +++-
> > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> > > 7 files changed, 49 insertions(+), 6 deletions(-)
>
> ...
>
> > > + /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > + Until we benchmark data on other x86 processor, disable non-temporal
> > > + stores in memset. */
>
> Well, something's fishy here:
>
> $ ./elf/ld.so --list-tunables | grep threshold
> glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> ^^^^^^^^^
>
> on glibc-2.39.9000-300-g54c1efdac55b from git.
>
> That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> 0xffffffffffffffff by default...
>
> Thx.
>
Thanks for bringing this up, looking into it.
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote:
> On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
> >
> > Hi,
> >
> > I'm not subscribed to the glibc list - pls CC me directly on replies.
> >
> > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > > the same. This includes both the exact value and whether non-temporal
> > > > stores are profitable at all for a given arch.
> > > >
> > > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > > disable non-temporal stores for non Intel vendors as the only
> > > > benchmarks showing its benefit have been on Intel hardware.
> > > > ---
> > > > manual/tunables.texi | 16 +++++++++++++++-
> > > > sysdeps/x86/cacheinfo.h | 8 +++++++-
> > > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> > > > sysdeps/x86/dl-tunables.list | 3 +++
> > > > sysdeps/x86/include/cpu-features.h | 4 +++-
> > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> > > > 7 files changed, 49 insertions(+), 6 deletions(-)
> >
> > ...
> >
> > > > + /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > > + Until we benchmark data on other x86 processor, disable non-temporal
> > > > + stores in memset. */
> >
> > Well, something's fishy here:
> >
> > $ ./elf/ld.so --list-tunables | grep threshold
> > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > ^^^^^^^^^
> >
> > on glibc-2.39.9000-300-g54c1efdac55b from git.
> >
> > That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> > 0xffffffffffffffff by default...
> >
> > Thx.
> >
>
> Thanks for bringing this up, looking into it.
Thx, so Michael did debug it yesterday to the ranges mismatching:
diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
index 147cc4cf23f5..ecf3c1d3736e 100644
--- a/elf/dl-tunables.c
+++ b/elf/dl-tunables.c
@@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp,
/* Bail out if the bounds are not valid. */
if (tunable_val_lt (val, min, unsigned_cmp)
- || tunable_val_lt (max, val, unsigned_cmp))
+ || tunable_val_lt (max, val, unsigned_cmp)) {
+ _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n",
+ val, min, max);
return;
+ }
cur->val.numval = val;
cur->type.min = min;
$ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)"
dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff
dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff
bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
^^^^^^^
dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
but you guys probably should do the right fix here.
Thx.
On Fri, Jun 14, 2024 at 1:01 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote:
> > On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > Hi,
> > >
> > > I'm not subscribed to the glibc list - pls CC me directly on replies.
> > >
> > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > The tuning for non-temporal stores for memset vs memcpy is not always
> > > > > the same. This includes both the exact value and whether non-temporal
> > > > > stores are profitable at all for a given arch.
> > > > >
> > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > > > disable non-temporal stores for non Intel vendors as the only
> > > > > benchmarks showing its benefit have been on Intel hardware.
> > > > > ---
> > > > > manual/tunables.texi | 16 +++++++++++++++-
> > > > > sysdeps/x86/cacheinfo.h | 8 +++++++-
> > > > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
> > > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> > > > > sysdeps/x86/dl-tunables.list | 3 +++
> > > > > sysdeps/x86/include/cpu-features.h | 4 +++-
> > > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> > > > > 7 files changed, 49 insertions(+), 6 deletions(-)
> > >
> > > ...
> > >
> > > > > + /* Non-temporal stores in memset have only been tested on Intel hardware.
> > > > > + Until we benchmark data on other x86 processor, disable non-temporal
> > > > > + stores in memset. */
> > >
> > > Well, something's fishy here:
> > >
> > > $ ./elf/ld.so --list-tunables | grep threshold
> > > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff)
> > > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff)
> > > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > ^^^^^^^^^
> > >
> > > on glibc-2.39.9000-300-g54c1efdac55b from git.
> > >
> > > That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> > > 0xffffffffffffffff by default...
> > >
> > > Thx.
> > >
> >
> > Thanks for bringing this up, looking into it.
>
> Thx, so Michael did debug it yesterday to the ranges mismatching:
>
> diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
> index 147cc4cf23f5..ecf3c1d3736e 100644
> --- a/elf/dl-tunables.c
> +++ b/elf/dl-tunables.c
> @@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp,
>
> /* Bail out if the bounds are not valid. */
> if (tunable_val_lt (val, min, unsigned_cmp)
> - || tunable_val_lt (max, val, unsigned_cmp))
> + || tunable_val_lt (max, val, unsigned_cmp)) {
> + _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n",
> + val, min, max);
> return;
> + }
>
> cur->val.numval = val;
> cur->type.min = min;
>
> $ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)"
> dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff
> dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff
> bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> ^^^^^^^
>
> dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
>
> but you guys probably should do the right fix here.
Just posted the fix, you should be CCd on it.
>
> Thx.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Jun 14, 2024 at 11:03 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:
> On Fri, Jun 14, 2024 at 1:01 PM Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote:
> > > On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote:
> > > >
> > > > Hi,
> > > >
> > > > I'm not subscribed to the glibc list - pls CC me directly on replies.
> > > >
> > > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote:
> > > > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <
> goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > The tuning for non-temporal stores for memset vs memcpy is not
> always
> > > > > > the same. This includes both the exact value and whether
> non-temporal
> > > > > > stores are profitable at all for a given arch.
> > > > > >
> > > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we
> > > > > > disable non-temporal stores for non Intel vendors as the only
> > > > > > benchmarks showing its benefit have been on Intel hardware.
> > > > > > ---
> > > > > > manual/tunables.texi | 16
> +++++++++++++++-
> > > > > > sysdeps/x86/cacheinfo.h | 8 +++++++-
> > > > > > sysdeps/x86/dl-cacheinfo.h | 16
> ++++++++++++++++
> > > > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
> > > > > > sysdeps/x86/dl-tunables.list | 3 +++
> > > > > > sysdeps/x86/include/cpu-features.h | 4 +++-
> > > > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
> > > > > > 7 files changed, 49 insertions(+), 6 deletions(-)
> > > >
> > > > ...
> > > >
> > > > > > + /* Non-temporal stores in memset have only been tested on
> Intel hardware.
> > > > > > + Until we benchmark data on other x86 processor, disable
> non-temporal
> > > > > > + stores in memset. */
> > > >
> > > > Well, something's fishy here:
> > > >
> > > > $ ./elf/ld.so --list-tunables | grep threshold
> > > > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max:
> 0xffffffffffffffff)
> > > > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max:
> 0xfffffffffffffff)
> > > > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
> > > > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1,
> max: 0xffffffffffffffff)
> > > > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max:
> 0xffffffffffffffff)
> > > > ^^^^^^^^^
> > > >
> > > > on glibc-2.39.9000-300-g54c1efdac55b from git.
> > > >
> > > > That's on a AMD Zen1 so I'd expect that memset NT threshold to be
> > > > 0xffffffffffffffff by default...
> > > >
> > > > Thx.
> > > >
> > >
> > > Thanks for bringing this up, looking into it.
> >
> > Thx, so Michael did debug it yesterday to the ranges mismatching:
> >
> > diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
> > index 147cc4cf23f5..ecf3c1d3736e 100644
> > --- a/elf/dl-tunables.c
> > +++ b/elf/dl-tunables.c
> > @@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const
> tunable_val_t *valp,
> >
> > /* Bail out if the bounds are not valid. */
> > if (tunable_val_lt (val, min, unsigned_cmp)
> > - || tunable_val_lt (max, val, unsigned_cmp))
> > + || tunable_val_lt (max, val, unsigned_cmp)) {
> > + _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n",
> > + val, min, max);
> > return;
> > + }
> >
> > cur->val.numval = val;
> > cur->type.min = min;
> >
> > $ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)"
> > dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff
> > dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size:
> 0xffffffffffffffff
> > bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> > ^^^^^^^
> >
> > dl_init_cacheinfo: memset_non_temporal_threshold, tunable set:
> 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff
> > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max:
> 0xffffffffffffffff)
> >
> > but you guys probably should do the right fix here.
>
> Just posted the fix, you should be CCd on it.
> >
> > Thx.
> >
> > --
> > Regards/Gruss,
> > Boris.
> >
> > https://people.kernel.org/tglx/notes-about-netiquette
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
Sunil Pandey via Libc-stable <libc-stable@sourceware.org> writes:
> I would like to backport this patch to release branches.
> Any comments or objections?
Beware: adding a tunable changes the ABI between ld.so and libc.so.
This may affect upgrades that don't expect that.
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -574,6 +576,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set the threshold in bytes for non temporal stores in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores in memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
@@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we have benchmark data for other x86 processors, disable
+ non-temporal stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
@@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
@@ -944,8 +944,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP