[v1,5/5] x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h

Message ID 20211101054952.2349590-5-goldstein.w.n@gmail.com
State Superseded
Delegated to: H.J. Lu
Headers
Series [v1,1/5] string: Make tests bidirectional test-memcpy.c |

Checks

Context Check Description
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein Nov. 1, 2021, 5:49 a.m. UTC
  No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks, the vector copy loop — especially now that it handles 4k
aliasing — is better for these medium-sized copies.

On Skylake with ERMS:

Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62
---
 sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
  

Comments

H.J. Lu Nov. 6, 2021, 2:31 a.m. UTC | #1
On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> No bug.
> 
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks the vector copy loop, especially now that it handles 4k
> aliasing, is better for these medium ranged.
> 
> On Skylake with ERMS:
> 
> Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> 4096,   0,      0,      0,      0.975
> 4096,   0,      0,      1,      0.953
> 4096,   12,     0,      0,      0.969
> 4096,   12,     0,      1,      0.872
> 4096,   44,     0,      0,      0.979
> 4096,   44,     0,      1,      0.83
> 4096,   0,      12,     0,      1.006
> 4096,   0,      12,     1,      0.989
> 4096,   0,      44,     0,      0.739
> 4096,   0,      44,     1,      0.942
> 4096,   12,     12,     0,      1.009
> 4096,   12,     12,     1,      0.973
> 4096,   44,     44,     0,      0.791
> 4096,   44,     44,     1,      0.961
> 4096,   2048,   0,      0,      0.978
> 4096,   2048,   0,      1,      0.951
> 4096,   2060,   0,      0,      0.986
> 4096,   2060,   0,      1,      0.963
> 4096,   2048,   12,     0,      0.971
> 4096,   2048,   12,     1,      0.941
> 4096,   2060,   12,     0,      0.977
> 4096,   2060,   12,     1,      0.949
> 8192,   0,      0,      0,      0.85
> 8192,   0,      0,      1,      0.845
> 8192,   13,     0,      0,      0.937
> 8192,   13,     0,      1,      0.939
> 8192,   45,     0,      0,      0.932
> 8192,   45,     0,      1,      0.927
> 8192,   0,      13,     0,      0.621
> 8192,   0,      13,     1,      0.62
> 8192,   0,      45,     0,      0.53
> 8192,   0,      45,     1,      0.516
> 8192,   13,     13,     0,      0.664
> 8192,   13,     13,     1,      0.659
> 8192,   45,     45,     0,      0.593
> 8192,   45,     45,     1,      0.575
> 8192,   2048,   0,      0,      0.854
> 8192,   2048,   0,      1,      0.834
> 8192,   2061,   0,      0,      0.863
> 8192,   2061,   0,      1,      0.857
> 8192,   2048,   13,     0,      0.63
> 8192,   2048,   13,     1,      0.629
> 8192,   2061,   13,     0,      0.627
> 8192,   2061,   13,     1,      0.62
> ---
>  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index e6c94dfd02..712b7c7fd0 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
>      {
> -      rep_movsb_threshold = 2048 * (64 / 16);
> +      rep_movsb_threshold = 4096 * (64 / 16);

Please also update the default of x86_rep_stosb_threshold in

sysdeps/x86/dl-tunables.list

>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 64 * 8;
>  #endif
> @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
>  				    AVX_Fast_Unaligned_Load))
>      {
> -      rep_movsb_threshold = 2048 * (32 / 16);
> +      rep_movsb_threshold = 4096 * (32 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 32 * 8;
>  #endif
>      }
>    else
>      {
> -      rep_movsb_threshold = 2048 * (16 / 16);
> +      rep_movsb_threshold = 4096 * (16 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 16 * 8;
>  #endif
> @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>  
> +
> +
> +

Please don't add these blank lines.

>    unsigned long int rep_movsb_stop_threshold;
>    /* ERMS feature is implemented from AMD Zen3 architecture and it is
>       performing poorly for data above L2 cache size. Henceforth, adding
> -- 
> 2.25.1
> 

Thanks.

H.J.
  
Noah Goldstein Nov. 6, 2021, 4:39 a.m. UTC | #2
On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > No bug.
> >
> > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > benchmarks the vector copy loop, especially now that it handles 4k
> > aliasing, is better for these medium ranged.
> >
> > On Skylake with ERMS:
> >
> > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > 4096,   0,      0,      0,      0.975
> > 4096,   0,      0,      1,      0.953
> > 4096,   12,     0,      0,      0.969
> > 4096,   12,     0,      1,      0.872
> > 4096,   44,     0,      0,      0.979
> > 4096,   44,     0,      1,      0.83
> > 4096,   0,      12,     0,      1.006
> > 4096,   0,      12,     1,      0.989
> > 4096,   0,      44,     0,      0.739
> > 4096,   0,      44,     1,      0.942
> > 4096,   12,     12,     0,      1.009
> > 4096,   12,     12,     1,      0.973
> > 4096,   44,     44,     0,      0.791
> > 4096,   44,     44,     1,      0.961
> > 4096,   2048,   0,      0,      0.978
> > 4096,   2048,   0,      1,      0.951
> > 4096,   2060,   0,      0,      0.986
> > 4096,   2060,   0,      1,      0.963
> > 4096,   2048,   12,     0,      0.971
> > 4096,   2048,   12,     1,      0.941
> > 4096,   2060,   12,     0,      0.977
> > 4096,   2060,   12,     1,      0.949
> > 8192,   0,      0,      0,      0.85
> > 8192,   0,      0,      1,      0.845
> > 8192,   13,     0,      0,      0.937
> > 8192,   13,     0,      1,      0.939
> > 8192,   45,     0,      0,      0.932
> > 8192,   45,     0,      1,      0.927
> > 8192,   0,      13,     0,      0.621
> > 8192,   0,      13,     1,      0.62
> > 8192,   0,      45,     0,      0.53
> > 8192,   0,      45,     1,      0.516
> > 8192,   13,     13,     0,      0.664
> > 8192,   13,     13,     1,      0.659
> > 8192,   45,     45,     0,      0.593
> > 8192,   45,     45,     1,      0.575
> > 8192,   2048,   0,      0,      0.854
> > 8192,   2048,   0,      1,      0.834
> > 8192,   2061,   0,      0,      0.863
> > 8192,   2061,   0,      1,      0.857
> > 8192,   2048,   13,     0,      0.63
> > 8192,   2048,   13,     1,      0.629
> > 8192,   2061,   13,     0,      0.627
> > 8192,   2061,   13,     1,      0.62
> > ---
> >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> >  1 file changed, 6 insertions(+), 3 deletions(-)
> >
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index e6c94dfd02..712b7c7fd0 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> >      {
> > -      rep_movsb_threshold = 2048 * (64 / 16);
> > +      rep_movsb_threshold = 4096 * (64 / 16);
>
> Please also update the default of x86_rep_stosb_threshold in

Do you know what to set it at?

I haven't tested recently but last time I checked stosb was significantly
better even for smaller values than movsb. Think it warrants another patch
as the numbers in this commit are for movsb and I don't think the two are
necessarily 1-1.

>
> sysdeps/x86/dl-tunables.list
>
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 64 * 8;
> >  #endif
> > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> >                                   AVX_Fast_Unaligned_Load))
> >      {
> > -      rep_movsb_threshold = 2048 * (32 / 16);
> > +      rep_movsb_threshold = 4096 * (32 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 32 * 8;
> >  #endif
> >      }
> >    else
> >      {
> > -      rep_movsb_threshold = 2048 * (16 / 16);
> > +      rep_movsb_threshold = 4096 * (16 / 16);
> >  #if HAVE_TUNABLES
> >        minimum_rep_movsb_threshold = 16 * 8;
> >  #endif
> > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> >      rep_movsb_threshold = 2112;
> >
> > +
> > +
> > +
>
> Please don't add these blank lines.
Fixed.


>
> >    unsigned long int rep_movsb_stop_threshold;
> >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> >       performing poorly for data above L2 cache size. Henceforth, adding
> > --
> > 2.25.1
> >
>
> Thanks.
>
> H.J.
  
H.J. Lu Nov. 6, 2021, 12:04 p.m. UTC | #3
On Fri, Nov 5, 2021 at 9:39 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > > No bug.
> > >
> > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > benchmarks the vector copy loop, especially now that it handles 4k
> > > aliasing, is better for these medium ranged.
> > >
> > > On Skylake with ERMS:
> > >
> > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > 4096,   0,      0,      0,      0.975
> > > 4096,   0,      0,      1,      0.953
> > > 4096,   12,     0,      0,      0.969
> > > 4096,   12,     0,      1,      0.872
> > > 4096,   44,     0,      0,      0.979
> > > 4096,   44,     0,      1,      0.83
> > > 4096,   0,      12,     0,      1.006
> > > 4096,   0,      12,     1,      0.989
> > > 4096,   0,      44,     0,      0.739
> > > 4096,   0,      44,     1,      0.942
> > > 4096,   12,     12,     0,      1.009
> > > 4096,   12,     12,     1,      0.973
> > > 4096,   44,     44,     0,      0.791
> > > 4096,   44,     44,     1,      0.961
> > > 4096,   2048,   0,      0,      0.978
> > > 4096,   2048,   0,      1,      0.951
> > > 4096,   2060,   0,      0,      0.986
> > > 4096,   2060,   0,      1,      0.963
> > > 4096,   2048,   12,     0,      0.971
> > > 4096,   2048,   12,     1,      0.941
> > > 4096,   2060,   12,     0,      0.977
> > > 4096,   2060,   12,     1,      0.949
> > > 8192,   0,      0,      0,      0.85
> > > 8192,   0,      0,      1,      0.845
> > > 8192,   13,     0,      0,      0.937
> > > 8192,   13,     0,      1,      0.939
> > > 8192,   45,     0,      0,      0.932
> > > 8192,   45,     0,      1,      0.927
> > > 8192,   0,      13,     0,      0.621
> > > 8192,   0,      13,     1,      0.62
> > > 8192,   0,      45,     0,      0.53
> > > 8192,   0,      45,     1,      0.516
> > > 8192,   13,     13,     0,      0.664
> > > 8192,   13,     13,     1,      0.659
> > > 8192,   45,     45,     0,      0.593
> > > 8192,   45,     45,     1,      0.575
> > > 8192,   2048,   0,      0,      0.854
> > > 8192,   2048,   0,      1,      0.834
> > > 8192,   2061,   0,      0,      0.863
> > > 8192,   2061,   0,      1,      0.857
> > > 8192,   2048,   13,     0,      0.63
> > > 8192,   2048,   13,     1,      0.629
> > > 8192,   2061,   13,     0,      0.627
> > > 8192,   2061,   13,     1,      0.62
> > > ---
> > >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> > >  1 file changed, 6 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > index e6c94dfd02..712b7c7fd0 100644
> > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > +      rep_movsb_threshold = 4096 * (64 / 16);
> >
> > Please also update the default of x86_rep_stosb_threshold in
>
> Do you know what to set it at?

Oops.  I meant

    x86_rep_movsb_threshold {
      type: SIZE_T
      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
      # isn't faster on short data.  The memcpy micro benchmark in glibc
      # shows that 2KB is the approximate value above which REP MOVSB
      # becomes faster than SSE2 optimization on processors with Enhanced
      # REP MOVSB.  Since larger register size can move more data with a
      # single load and store, the threshold is higher with larger register
      # size.  Note: Since the REP MOVSB threshold must be greater than 8
      # times of vector size and the default value is 2048 * (vector size

       ^^^^^^^
      # / 16), the default value and the minimum value must be updated at
      # run-time.  NB: Don't set the default value since we can't tell if
      # the tunable value is set by user or not [BZ #27069].
      minval: 1
    }

> I haven't tested recently but last time I checked stosb was significantly
> better even for smaller values than movsb. Think it warrants another patch
> as the numbers in this commit are for movsb and I don't think the two are
> necessarily 1-1.
>
> >
> > sysdeps/x86/dl-tunables.list
> >
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 64 * 8;
> > >  #endif
> > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > >                                   AVX_Fast_Unaligned_Load))
> > >      {
> > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 32 * 8;
> > >  #endif
> > >      }
> > >    else
> > >      {
> > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > >  #if HAVE_TUNABLES
> > >        minimum_rep_movsb_threshold = 16 * 8;
> > >  #endif
> > > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> > >      rep_movsb_threshold = 2112;
> > >
> > > +
> > > +
> > > +
> >
> > Please don't add these blank lines.
> Fixed.
>
>
> >
> > >    unsigned long int rep_movsb_stop_threshold;
> > >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > >       performing poorly for data above L2 cache size. Henceforth, adding
> > > --
> > > 2.25.1
> > >
> >
> > Thanks.
> >
> > H.J.
  
Noah Goldstein Nov. 6, 2021, 5:38 p.m. UTC | #4
On Sat, Nov 6, 2021 at 7:05 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 5, 2021 at 9:39 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Nov 5, 2021 at 9:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Nov 01, 2021 at 12:49:52AM -0500, Noah Goldstein wrote:
> > > > No bug.
> > > >
> > > > This patch doubles the rep_movsb_threshold when using ERMS. Based on
> > > > benchmarks the vector copy loop, especially now that it handles 4k
> > > > aliasing, is better for these medium ranged.
> > > >
> > > > On Skylake with ERMS:
> > > >
> > > > Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
> > > > 4096,   0,      0,      0,      0.975
> > > > 4096,   0,      0,      1,      0.953
> > > > 4096,   12,     0,      0,      0.969
> > > > 4096,   12,     0,      1,      0.872
> > > > 4096,   44,     0,      0,      0.979
> > > > 4096,   44,     0,      1,      0.83
> > > > 4096,   0,      12,     0,      1.006
> > > > 4096,   0,      12,     1,      0.989
> > > > 4096,   0,      44,     0,      0.739
> > > > 4096,   0,      44,     1,      0.942
> > > > 4096,   12,     12,     0,      1.009
> > > > 4096,   12,     12,     1,      0.973
> > > > 4096,   44,     44,     0,      0.791
> > > > 4096,   44,     44,     1,      0.961
> > > > 4096,   2048,   0,      0,      0.978
> > > > 4096,   2048,   0,      1,      0.951
> > > > 4096,   2060,   0,      0,      0.986
> > > > 4096,   2060,   0,      1,      0.963
> > > > 4096,   2048,   12,     0,      0.971
> > > > 4096,   2048,   12,     1,      0.941
> > > > 4096,   2060,   12,     0,      0.977
> > > > 4096,   2060,   12,     1,      0.949
> > > > 8192,   0,      0,      0,      0.85
> > > > 8192,   0,      0,      1,      0.845
> > > > 8192,   13,     0,      0,      0.937
> > > > 8192,   13,     0,      1,      0.939
> > > > 8192,   45,     0,      0,      0.932
> > > > 8192,   45,     0,      1,      0.927
> > > > 8192,   0,      13,     0,      0.621
> > > > 8192,   0,      13,     1,      0.62
> > > > 8192,   0,      45,     0,      0.53
> > > > 8192,   0,      45,     1,      0.516
> > > > 8192,   13,     13,     0,      0.664
> > > > 8192,   13,     13,     1,      0.659
> > > > 8192,   45,     45,     0,      0.593
> > > > 8192,   45,     45,     1,      0.575
> > > > 8192,   2048,   0,      0,      0.854
> > > > 8192,   2048,   0,      1,      0.834
> > > > 8192,   2061,   0,      0,      0.863
> > > > 8192,   2061,   0,      1,      0.857
> > > > 8192,   2048,   13,     0,      0.63
> > > > 8192,   2048,   13,     1,      0.629
> > > > 8192,   2061,   13,     0,      0.627
> > > > 8192,   2061,   13,     1,      0.62
> > > > ---
> > > >  sysdeps/x86/dl-cacheinfo.h | 9 ++++++---
> > > >  1 file changed, 6 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > > index e6c94dfd02..712b7c7fd0 100644
> > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > @@ -871,7 +871,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > > >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (64 / 16);
> > > > +      rep_movsb_threshold = 4096 * (64 / 16);
> > >
> > > Please also update the default of x86_rep_stosb_threshold in
> >
> > Do you know what to set it at?
>
> Oops.  I meant

ah. Fixed.

>
>     x86_rep_movsb_threshold {
>       type: SIZE_T
>       # Since there is overhead to set up REP MOVSB operation, REP MOVSB
>       # isn't faster on short data.  The memcpy micro benchmark in glibc
>       # shows that 2KB is the approximate value above which REP MOVSB
>       # becomes faster than SSE2 optimization on processors with Enhanced
>       # REP MOVSB.  Since larger register size can move more data with a
>       # single load and store, the threshold is higher with larger register
>       # size.  Note: Since the REP MOVSB threshold must be greater than 8
>       # times of vector size and the default value is 2048 * (vector size
>
>        ^^^^^^^
>       # / 16), the default value and the minimum value must be updated at
>       # run-time.  NB: Don't set the default value since we can't tell if
>       # the tunable value is set by user or not [BZ #27069].
>       minval: 1
>     }
>
> > I haven't tested recently but last time I checked stosb was significantly
> > better even for smaller values than movsb. Think it warrants another patch
> > as the numbers in this commit are for movsb and I don't think the two are
> > necessarily 1-1.
> >
> > >
> > > sysdeps/x86/dl-tunables.list
> > >
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 64 * 8;
> > > >  #endif
> > > > @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
> > > >                                   AVX_Fast_Unaligned_Load))
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (32 / 16);
> > > > +      rep_movsb_threshold = 4096 * (32 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 32 * 8;
> > > >  #endif
> > > >      }
> > > >    else
> > > >      {
> > > > -      rep_movsb_threshold = 2048 * (16 / 16);
> > > > +      rep_movsb_threshold = 4096 * (16 / 16);
> > > >  #if HAVE_TUNABLES
> > > >        minimum_rep_movsb_threshold = 16 * 8;
> > > >  #endif
> > > > @@ -896,6 +896,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> > > >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> > > >      rep_movsb_threshold = 2112;
> > > >
> > > > +
> > > > +
> > > > +
> > >
> > > Please don't add these blank lines.
> > Fixed.
> >
> >
> > >
> > > >    unsigned long int rep_movsb_stop_threshold;
> > > >    /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > > >       performing poorly for data above L2 cache size. Henceforth, adding
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > Thanks.
> > >
> > > H.J.
>
>
>
> --
> H.J.
  

Patch

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..712b7c7fd0 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -871,7 +871,7 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
@@ -896,6 +896,9 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
+
+
+
   unsigned long int rep_movsb_stop_threshold;
   /* ERMS feature is implemented from AMD Zen3 architecture and it is
      performing poorly for data above L2 cache size. Henceforth, adding