[v3,5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h

Message ID 20211106173706.3125357-5-goldstein.w.n@gmail.com
State Superseded
Headers
Series [v3,1/5] string: Make tests birdirectional test-memcpy.c |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein Nov. 6, 2021, 5:37 p.m. UTC
  No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium ranged.

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
---
 sysdeps/x86/dl-cacheinfo.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
  

Comments

H.J. Lu Nov. 6, 2021, 5:56 p.m. UTC | #1
On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> No bug.
>
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks the vector copy loop, especially now that it handles 4k
> aliasing, is better for these medium ranged.
>
> On Skylake with ERMS:
>
> Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> 4096,   0,      0,      0,      0.975
> 4096,   0,      0,      1,      0.953
> 4096,   12,     0,      0,      0.969
> 4096,   12,     0,      1,      0.872
> 4096,   44,     0,      0,      0.979
> 4096,   44,     0,      1,      0.83
> 4096,   0,      12,     0,      1.006
> 4096,   0,      12,     1,      0.989
> 4096,   0,      44,     0,      0.739
> 4096,   0,      44,     1,      0.942
> 4096,   12,     12,     0,      1.009
> 4096,   12,     12,     1,      0.973
> 4096,   44,     44,     0,      0.791
> 4096,   44,     44,     1,      0.961
> 4096,   2048,   0,      0,      0.978
> 4096,   2048,   0,      1,      0.951
> 4096,   2060,   0,      0,      0.986
> 4096,   2060,   0,      1,      0.963
> 4096,   2048,   12,     0,      0.971
> 4096,   2048,   12,     1,      0.941
> 4096,   2060,   12,     0,      0.977
> 4096,   2060,   12,     1,      0.949
> 8192,   0,      0,      0,      0.85
> 8192,   0,      0,      1,      0.845
> 8192,   13,     0,      0,      0.937
> 8192,   13,     0,      1,      0.939
> 8192,   45,     0,      0,      0.932
> 8192,   45,     0,      1,      0.927
> 8192,   0,      13,     0,      0.621
> 8192,   0,      13,     1,      0.62
> 8192,   0,      45,     0,      0.53
> 8192,   0,      45,     1,      0.516
> 8192,   13,     13,     0,      0.664
> 8192,   13,     13,     1,      0.659
> 8192,   45,     45,     0,      0.593
> 8192,   45,     45,     1,      0.575
> 8192,   2048,   0,      0,      0.854
> 8192,   2048,   0,      1,      0.834
> 8192,   2061,   0,      0,      0.863
> 8192,   2061,   0,      1,      0.857
> 8192,   2048,   13,     0,      0.63
> 8192,   2048,   13,     1,      0.629
> 8192,   2061,   13,     0,      0.627
> 8192,   2061,   13,     1,      0.62
> ---
>  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index e6c94dfd02..ceb3b53828 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
>    unsigned int minimum_rep_movsb_threshold;
>  #endif
> -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
>    unsigned int rep_movsb_threshold;
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
>      {
> -      rep_movsb_threshold = 2048 * (64 / 16);
> +      rep_movsb_threshold = 4096 * (64 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 64 * 8;
>  #endif
> @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
>                                     AVX_Fast_Unaligned_Load))
>      {
> -      rep_movsb_threshold = 2048 * (32 / 16);
> +      rep_movsb_threshold = 4096 * (32 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 32 * 8;
>  #endif
>      }
>    else
>      {
> -      rep_movsb_threshold = 2048 * (16 / 16);
> +      rep_movsb_threshold = 4096 * (16 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 16 * 8;
>  #endif
> --
> 2.25.1
>

You need to update comments for x86_rep_movsb_threshold
in sysdeps/x86/dl-tunables.list
  
Noah Goldstein Nov. 6, 2021, 6:11 p.m. UTC | #2
On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > No bug.
> >
> > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > benchmarks the vector copy loop, especially now that it handles 4k
> > aliasing, is better for these medium ranged.
> >
> > On Skylake with ERMS:
> >
> > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > 4096,   0,      0,      0,      0.975
> > 4096,   0,      0,      1,      0.953
> > 4096,   12,     0,      0,      0.969
> > 4096,   12,     0,      1,      0.872
> > 4096,   44,     0,      0,      0.979
> > 4096,   44,     0,      1,      0.83
> > 4096,   0,      12,     0,      1.006
> > 4096,   0,      12,     1,      0.989
> > 4096,   0,      44,     0,      0.739
> > 4096,   0,      44,     1,      0.942
> > 4096,   12,     12,     0,      1.009
> > 4096,   12,     12,     1,      0.973
> > 4096,   44,     44,     0,      0.791
> > 4096,   44,     44,     1,      0.961
> > 4096,   2048,   0,      0,      0.978
> > 4096,   2048,   0,      1,      0.951
> > 4096,   2060,   0,      0,      0.986
> > 4096,   2060,   0,      1,      0.963
> > 4096,   2048,   12,     0,      0.971
> > 4096,   2048,   12,     1,      0.941
> > 4096,   2060,   12,     0,      0.977
> > 4096,   2060,   12,     1,      0.949
> > 8192,   0,      0,      0,      0.85
> > 8192,   0,      0,      1,      0.845
> > 8192,   13,     0,      0,      0.937
> > 8192,   13,     0,      1,      0.939
> > 8192,   45,     0,      0,      0.932
> > 8192,   45,     0,      1,      0.927
> > 8192,   0,      13,     0,      0.621
> > 8192,   0,      13,     1,      0.62
> > 8192,   0,      45,     0,      0.53
> > 8192,   0,      45,     1,      0.516
> > 8192,   13,     13,     0,      0.664
> > 8192,   13,     13,     1,      0.659
> > 8192,   45,     45,     0,      0.593
> > 8192,   45,     45,     1,      0.575
> > 8192,   2048,   0,      0,      0.854
> > 8192,   2048,   0,      1,      0.834
> > 8192,   2061,   0,      0,      0.863
> > 8192,   2061,   0,      1,      0.857
> > 8192,   2048,   13,     0,      0.63
> > 8192,   2048,   13,     1,      0.629
> > 8192,   2061,   13,     0,      0.627
> > 8192,   2061,   13,     1,      0.62
> > ---
> >  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index e6c94dfd02..ceb3b53828 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> >    unsigned int minimum_rep_movsb_threshold;
> >  #endif
> > -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> > +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
> >    unsigned int rep_movsb_threshold;
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> >      {
> > -      rep_movsb_threshold = 2048 * (64 / 16);
> > +      rep_movsb_threshold = 4096 * (64 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 64 * 8;
> >  #endif
> > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> >                                     AVX_Fast_Unaligned_Load))
> >      {
> > -      rep_movsb_threshold = 2048 * (32 / 16);
> > +      rep_movsb_threshold = 4096 * (32 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 32 * 8;
> >  #endif
> >      }
> >    else
> >      {
> > -      rep_movsb_threshold = 2048 * (16 / 16);
> > +      rep_movsb_threshold = 4096 * (16 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 16 * 8;
> >  #endif
> > --
> > 2.25.1
> >
>
> You need to update comments for x86_rep_movsb_threshold
> in sysdeps/x86/dl-tunables.list

Can do.

Noticing that the original values were based on comparisons with SSE2 likely on
SnB or IVB. I don't have any indication that the 2048 value is not
optimal for those
processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?
>
> --
> H.J.
  
H.J. Lu Nov. 6, 2021, 6:21 p.m. UTC | #3
On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > No bug.
> > >
> > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > benchmarks the vector copy loop, especially now that it handles 4k
> > > aliasing, is better for these medium ranged.
> > >
> > > On Skylake with ERMS:
> > >
> > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > 4096,   0,      0,      0,      0.975
> > > 4096,   0,      0,      1,      0.953
> > > 4096,   12,     0,      0,      0.969
> > > 4096,   12,     0,      1,      0.872
> > > 4096,   44,     0,      0,      0.979
> > > 4096,   44,     0,      1,      0.83
> > > 4096,   0,      12,     0,      1.006
> > > 4096,   0,      12,     1,      0.989
> > > 4096,   0,      44,     0,      0.739
> > > 4096,   0,      44,     1,      0.942
> > > 4096,   12,     12,     0,      1.009
> > > 4096,   12,     12,     1,      0.973
> > > 4096,   44,     44,     0,      0.791
> > > 4096,   44,     44,     1,      0.961
> > > 4096,   2048,   0,      0,      0.978
> > > 4096,   2048,   0,      1,      0.951
> > > 4096,   2060,   0,      0,      0.986
> > > 4096,   2060,   0,      1,      0.963
> > > 4096,   2048,   12,     0,      0.971
> > > 4096,   2048,   12,     1,      0.941
> > > 4096,   2060,   12,     0,      0.977
> > > 4096,   2060,   12,     1,      0.949
> > > 8192,   0,      0,      0,      0.85
> > > 8192,   0,      0,      1,      0.845
> > > 8192,   13,     0,      0,      0.937
> > > 8192,   13,     0,      1,      0.939
> > > 8192,   45,     0,      0,      0.932
> > > 8192,   45,     0,      1,      0.927
> > > 8192,   0,      13,     0,      0.621
> > > 8192,   0,      13,     1,      0.62
> > > 8192,   0,      45,     0,      0.53
> > > 8192,   0,      45,     1,      0.516
> > > 8192,   13,     13,     0,      0.664
> > > 8192,   13,     13,     1,      0.659
> > > 8192,   45,     45,     0,      0.593
> > > 8192,   45,     45,     1,      0.575
> > > 8192,   2048,   0,      0,      0.854
> > > 8192,   2048,   0,      1,      0.834
> > > 8192,   2061,   0,      0,      0.863
> > > 8192,   2061,   0,      1,      0.857
> > > 8192,   2048,   13,     0,      0.63
> > > 8192,   2048,   13,     1,      0.629
> > > 8192,   2061,   13,     0,      0.627
> > > 8192,   2061,   13,     1,      0.62
> > > ---
> > >  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
> > >  1 file changed, 4 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > index e6c94dfd02..ceb3b53828 100644
> > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> > >    unsigned int minimum_rep_movsb_threshold;
> > >  #endif
> > > -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> > > +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
> > >    unsigned int rep_movsb_threshold;
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > +      rep_movsb_threshold = 4096 * (64 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 64 * 8;
> > >  #endif
> > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > >                                     AVX_Fast_Unaligned_Load))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 32 * 8;
> > >  #endif
> > >      }
> > >    else
> > >      {
> > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 16 * 8;
> > >  #endif
> > > --
> > > 2.25.1
> > >
> >
> > You need to update comments for x86_rep_movsb_threshold
> > in sysdeps/x86/dl-tunables.list
>
> Can do.
>
> Noticing that the original values were based on comparisons with SSE2 likely on
> SnB or IVB. I don't have any indication that the 2048 value is not
> optimal for those
> processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?

Good idea.   So change the threshold to 2048 * (VEC_SIZE / 16) *
(VEC_SIZE / 16)?
  
Noah Goldstein Nov. 6, 2021, 6:34 p.m. UTC | #4
On Sat, Nov 6, 2021 at 1:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > No bug.
> > > >
> > > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > > benchmarks the vector copy loop, especially now that it handles 4k
> > > > aliasing, is better for these medium ranged.
> > > >
> > > > On Skylake with ERMS:
> > > >
> > > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > > 4096,   0,      0,      0,      0.975
> > > > 4096,   0,      0,      1,      0.953
> > > > 4096,   12,     0,      0,      0.969
> > > > 4096,   12,     0,      1,      0.872
> > > > 4096,   44,     0,      0,      0.979
> > > > 4096,   44,     0,      1,      0.83
> > > > 4096,   0,      12,     0,      1.006
> > > > 4096,   0,      12,     1,      0.989
> > > > 4096,   0,      44,     0,      0.739
> > > > 4096,   0,      44,     1,      0.942
> > > > 4096,   12,     12,     0,      1.009
> > > > 4096,   12,     12,     1,      0.973
> > > > 4096,   44,     44,     0,      0.791
> > > > 4096,   44,     44,     1,      0.961
> > > > 4096,   2048,   0,      0,      0.978
> > > > 4096,   2048,   0,      1,      0.951
> > > > 4096,   2060,   0,      0,      0.986
> > > > 4096,   2060,   0,      1,      0.963
> > > > 4096,   2048,   12,     0,      0.971
> > > > 4096,   2048,   12,     1,      0.941
> > > > 4096,   2060,   12,     0,      0.977
> > > > 4096,   2060,   12,     1,      0.949
> > > > 8192,   0,      0,      0,      0.85
> > > > 8192,   0,      0,      1,      0.845
> > > > 8192,   13,     0,      0,      0.937
> > > > 8192,   13,     0,      1,      0.939
> > > > 8192,   45,     0,      0,      0.932
> > > > 8192,   45,     0,      1,      0.927
> > > > 8192,   0,      13,     0,      0.621
> > > > 8192,   0,      13,     1,      0.62
> > > > 8192,   0,      45,     0,      0.53
> > > > 8192,   0,      45,     1,      0.516
> > > > 8192,   13,     13,     0,      0.664
> > > > 8192,   13,     13,     1,      0.659
> > > > 8192,   45,     45,     0,      0.593
> > > > 8192,   45,     45,     1,      0.575
> > > > 8192,   2048,   0,      0,      0.854
> > > > 8192,   2048,   0,      1,      0.834
> > > > 8192,   2061,   0,      0,      0.863
> > > > 8192,   2061,   0,      1,      0.857
> > > > 8192,   2048,   13,     0,      0.63
> > > > 8192,   2048,   13,     1,      0.629
> > > > 8192,   2061,   13,     0,      0.627
> > > > 8192,   2061,   13,     1,      0.62
> > > > ---
> > > >  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
> > > >  1 file changed, 4 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > > index e6c94dfd02..ceb3b53828 100644
> > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> > > >    unsigned int minimum_rep_movsb_threshold;
> > > >  #endif
> > > > -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> > > > +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
> > > >    unsigned int rep_movsb_threshold;
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > > +      rep_movsb_threshold = 4096 * (64 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 64 * 8;
> > > >  #endif
> > > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > > >                                     AVX_Fast_Unaligned_Load))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 32 * 8;
> > > >  #endif
> > > >      }
> > > >    else
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 16 * 8;
> > > >  #endif
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > You need to update comments for x86_rep_movsb_threshold
> > > in sysdeps/x86/dl-tunables.list
> >
> > Can do.
> >
> > Noticing that the original values were based on comparisons with SSE2 likely on
> > SnB or IVB. I don't have any indication that the 2048 value is not
> > optimal for those
> > processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?
>
> Good idea.   So change the threshold to 2048 * (VEC_SIZE / 16) *
> (VEC_SIZE / 16)?

Done and updated the comments explaining the thresholds.

>
> --
> H.J.
  

Patch

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..ceb3b53828 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,12 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif