x86: Add thresholds for "rep movsb/stosb" to tunables

Message ID CAMe9rOoYXMdOfedtZLx=GT-nFXThzoo7Q__H4vg=2vyOufGY6A@mail.gmail.com
State Superseded
Headers
Series x86: Add thresholds for "rep movsb/stosb" to tunables |

Commit Message

H.J. Lu May 23, 2020, 4:37 a.m. UTC
  On Fri, May 22, 2020 at 9:10 PM liqingqing <liqingqing3@huawei.com> wrote:
>
> this commit id 830566307f038387ca0af3fd327706a8d1a2f595 optimizes the implementation of the memset function,
> and sets macro REP_STOSB_THRESHOLD's default value to 2KB; when the input length is less than 2KB, the data flow is the same, and when the input length is larger than 2KB,
> this API will use STOSB instead of MOVQ
>
> but when I tested this API on an x86_64 platform,
> I found that this default value is not appropriate for some input lengths. Here are the environment and results:
>
> test suite: libMicro-0.4.0
>         ./memset -E -C 200 -L -S -W -N "memset_4k"    -s 4k    -I 250
>         ./memset -E -C 200 -L -S -W -N "memset_4k_uc" -s 4k    -u -I 400
>         ./memset -E -C 200 -L -S -W -N "memset_1m"    -s 1m   -I 200000
>         ./memset -E -C 200 -L -S -W -N "memset_10m"   -s 10m -I 2000000
>
> hardware platform:
>         Intel(R) Xeon(R) Gold 6266C CPU @ 3.00GHz
>         L1d cache:32KB
>         L1i cache: 32KB
>         L2 cache: 1MB
>         L3 cache: 60MB
>
> the result is that when input length is between the processor's L1 data cache and L2 cache size, the REP_STOSB_THRESHOLD=2KB will reduce performance.
>
>         before this commit     after this commit
>                 cycle      cycle
> memset_4k       249         96
> memset_10k      657         185
> memset_36k      2773        3767
> memset_100k     7594        10002
> memset_500k     37678       52149
> memset_1m       86780       108044
> memset_10m      1307238     1148994
>
>         before this commit          after this commit
>            MLC cache miss(10sec)         MLC cache miss(10sec)
> memset_4k       1,09,33,823          1,01,79,270
> memset_10k      1,23,78,958          1,05,41,087
> memset_36k      3,61,64,244          4,07,22,429
> memset_100k     8,25,33,052          9,31,81,253
> memset_500k     37,32,55,449         43,56,70,395
> memset_1m       75,16,28,239         88,29,90,237
> memset_10m      9,36,61,67,397       8,96,69,49,522
>
>
> though REP_STOSB_THRESHOLD can be modified at build time by using -DREP_STOSB_THRESHOLD=xxx,
> I think the default value may not be the best one, because most processors' L2 caches are larger than 2KB, so I submit a patch as below:
>
>
>
> From 44314a556239a7524b5a6451025737c1bdbb1cd0 Mon Sep 17 00:00:00 2001
> From: liqingqing <liqingqing3@huawei.com>
> Date: Thu, 21 May 2020 11:23:06 +0800
> Subject: [PATCH] update REP_STOSB_THRESHOLD's default value from 2k to 1M
> macro REP_STOSB_THRESHOLD's value will reduce memset performance when the input length is between the processor's L1 data cache and L2 cache sizes,
> so update the default value to eliminate the regression.
>

There is no single threshold value which is good for all workloads.
I don't think we should change REP_STOSB_THRESHOLD to 1MB.
On the other hand, the fixed threshold isn't flexible.  Please try this
patch to see if you can set the threshold for your specific workload.
  

Comments

H.J. Lu May 28, 2020, 11:56 a.m. UTC | #1
On Fri, May 22, 2020 at 9:37 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, May 22, 2020 at 9:10 PM liqingqing <liqingqing3@huawei.com> wrote:
> >
> > this commitid 830566307f038387ca0af3fd327706a8d1a2f595 optimize implementation of function memset,
> > and set macro REP_STOSB_THRESHOLD's default value to 2KB, when the input value is less than 2KB, the data flow is the same, and when the input value is large than 2KB,
> > this api will use STOB to instead of  MOVQ
> >
> > but when I test this API on x86_64 platform
> > and found that this default value is not appropriate for some input length. here it's the enviornment and result
> >
> > test suite: libMicro-0.4.0
> >         ./memset -E -C 200 -L -S -W -N "memset_4k"    -s 4k    -I 250
> >         ./memset -E -C 200 -L -S -W -N "memset_4k_uc" -s 4k    -u -I 400
> >         ./memset -E -C 200 -L -S -W -N "memset_1m"    -s 1m   -I 200000
> >         ./memset -E -C 200 -L -S -W -N "memset_10m"   -s 10m -I 2000000
> >
> > hardware platform:
> >         Intel(R) Xeon(R) Gold 6266C CPU @ 3.00GHz
> >         L1d cache:32KB
> >         L1i cache: 32KB
> >         L2 cache: 1MB
> >         L3 cache: 60MB
> >
> > the result is that when input length is between the processor's L1 data cache and L2 cache size, the REP_STOSB_THRESHOLD=2KB will reduce performance.
> >
> >         before this commit     after this commit
> >                 cycle      cycle
> > memset_4k       249         96
> > memset_10k      657         185
> > memset_36k      2773        3767
> > memset_100k     7594        10002
> > memset_500k     37678       52149
> > memset_1m       86780       108044
> > memset_10m      1307238     1148994
> >
> >         before this commit          after this commit
> >            MLC cache miss(10sec)         MLC cache miss(10sec)
> > memset_4k       1,09,33,823          1,01,79,270
> > memset_10k      1,23,78,958          1,05,41,087
> > memset_36k      3,61,64,244          4,07,22,429
> > memset_100k     8,25,33,052          9,31,81,253
> > memset_500k     37,32,55,449         43,56,70,395
> > memset_1m       75,16,28,239         88,29,90,237
> > memset_10m      9,36,61,67,397       8,96,69,49,522
> >
> >
> > though REP_STOSB_THRESHOLD can be modified at the building time by use -DREP_STOSB_THRESHOLD=xxx,
> > but I think the default value may be is not a better one, cause I think most of the processor's L2 cache is large than 2KB, so i submit a patch as below:
> >
> >
> >
> > From 44314a556239a7524b5a6451025737c1bdbb1cd0 Mon Sep 17 00:00:00 2001
> > From: liqingqing <liqingqing3@huawei.com>
> > Date: Thu, 21 May 2020 11:23:06 +0800
> > Subject: [PATCH] update REP_STOSB_THRESHOLD's default value from 2k to 1M
> > macro REP_STOSB_THRESHOLD's value will reduce memset performace when input length is between processor's L1 data cache and L2 cache.
> > so update the defaule value to eliminate the decrement .
> >
>
> There is no single threshold value which is good for all workloads.
> I don't think we should change REP_STOSB_THRESHOLD to 1MB.
> On the other hand, the fixed threshold isn't flexible.  Please try this
> patch to see if you can set the threshold for your specific workload.
>

Any comments, objections?

https://sourceware.org/pipermail/libc-alpha/2020-May/114281.html
  
Qingqing Li May 28, 2020, 1:47 p.m. UTC | #2
Hi Lu, thank you for your comment.
The REP_STOSB_THRESHOLD value of 2M suits the hardware platform that I used.
Since I do not have any other x86 environments, I can't make sure this change is good for all of them, and you are right.


On 2020/5/28 19:56, H.J. Lu wrote:
> On Fri, May 22, 2020 at 9:37 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Fri, May 22, 2020 at 9:10 PM liqingqing <liqingqing3@huawei.com> wrote:
>>>
>>> this commitid 830566307f038387ca0af3fd327706a8d1a2f595 optimize implementation of function memset,
>>> and set macro REP_STOSB_THRESHOLD's default value to 2KB, when the input value is less than 2KB, the data flow is the same, and when the input value is large than 2KB,
>>> this api will use STOB to instead of  MOVQ
>>>
>>> but when I test this API on x86_64 platform
>>> and found that this default value is not appropriate for some input length. here it's the enviornment and result
>>>
>>> test suite: libMicro-0.4.0
>>>         ./memset -E -C 200 -L -S -W -N "memset_4k"    -s 4k    -I 250
>>>         ./memset -E -C 200 -L -S -W -N "memset_4k_uc" -s 4k    -u -I 400
>>>         ./memset -E -C 200 -L -S -W -N "memset_1m"    -s 1m   -I 200000
>>>         ./memset -E -C 200 -L -S -W -N "memset_10m"   -s 10m -I 2000000
>>>
>>> hardware platform:
>>>         Intel(R) Xeon(R) Gold 6266C CPU @ 3.00GHz
>>>         L1d cache:32KB
>>>         L1i cache: 32KB
>>>         L2 cache: 1MB
>>>         L3 cache: 60MB
>>>
>>> the result is that when input length is between the processor's L1 data cache and L2 cache size, the REP_STOSB_THRESHOLD=2KB will reduce performance.
>>>
>>>         before this commit     after this commit
>>>                 cycle      cycle
>>> memset_4k       249         96
>>> memset_10k      657         185
>>> memset_36k      2773        3767
>>> memset_100k     7594        10002
>>> memset_500k     37678       52149
>>> memset_1m       86780       108044
>>> memset_10m      1307238     1148994
>>>
>>>         before this commit          after this commit
>>>            MLC cache miss(10sec)         MLC cache miss(10sec)
>>> memset_4k       1,09,33,823          1,01,79,270
>>> memset_10k      1,23,78,958          1,05,41,087
>>> memset_36k      3,61,64,244          4,07,22,429
>>> memset_100k     8,25,33,052          9,31,81,253
>>> memset_500k     37,32,55,449         43,56,70,395
>>> memset_1m       75,16,28,239         88,29,90,237
>>> memset_10m      9,36,61,67,397       8,96,69,49,522
>>>
>>>
>>> though REP_STOSB_THRESHOLD can be modified at the building time by use -DREP_STOSB_THRESHOLD=xxx,
>>> but I think the default value may be is not a better one, cause I think most of the processor's L2 cache is large than 2KB, so i submit a patch as below:
>>>
>>>
>>>
>>> From 44314a556239a7524b5a6451025737c1bdbb1cd0 Mon Sep 17 00:00:00 2001
>>> From: liqingqing <liqingqing3@huawei.com>
>>> Date: Thu, 21 May 2020 11:23:06 +0800
>>> Subject: [PATCH] update REP_STOSB_THRESHOLD's default value from 2k to 1M
>>> macro REP_STOSB_THRESHOLD's value will reduce memset performace when input length is between processor's L1 data cache and L2 cache.
>>> so update the defaule value to eliminate the decrement .
>>>
>>
>> There is no single threshold value which is good for all workloads.
>> I don't think we should change REP_STOSB_THRESHOLD to 1MB.
>> On the other hand, the fixed threshold isn't flexible.  Please try this
>> patch to see if you can set the threshold for your specific workload.
>>
> 
> Any comments, objections?
> 
> https://sourceware.org/pipermail/libc-alpha/2020-May/114281.html
>
  
Carlos O'Donell May 29, 2020, 1:13 p.m. UTC | #3
On 5/23/20 12:37 AM, H.J. Lu via Libc-alpha wrote:
> There is no single threshold value which is good for all workloads.
> I don't think we should change REP_STOSB_THRESHOLD to 1MB.
> On the other hand, the fixed threshold isn't flexible.  Please try this
> patch to see if you can set the threshold for your specific workload.

My request here is that the manual include a documentation of what the
minimums are for the tunable. Even an example reference of the minimum
value would be useful for the tunable e.g. On AVX512 systems this value
is X, on AVX systems this value is Y, on all other systems Z.
  
H.J. Lu May 29, 2020, 1:21 p.m. UTC | #4
On Fri, May 29, 2020 at 6:13 AM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On 5/23/20 12:37 AM, H.J. Lu via Libc-alpha wrote:
> > There is no single threshold value which is good for all workloads.
> > I don't think we should change REP_STOSB_THRESHOLD to 1MB.
> > On the other hand, the fixed threshold isn't flexible.  Please try this
> > patch to see if you can set the threshold for your specific workload.
>
> My request here is that the manual include a documentation of what the
> minimums are for the tunable. Even an example reference of the minimum
> value would be useful for the tunable e.g. On AVX512 systems this value
> is X, on AVX systems this value is Y, on all other systems Z.
>

The logic of the thresholds is:

 /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 2048 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 2048 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
  if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
  else
    __x86_rep_movsb_threshold = rep_movsb_threshold;

We can't simply say AVX512 machines will use ZMM and AVX machines
will use YMM.  It depends on other factors which are invisible to users.
Can you suggest some paragraph for libc manual?

Thanks.
  
Carlos O'Donell May 29, 2020, 4:18 p.m. UTC | #5
On 5/29/20 9:21 AM, H.J. Lu wrote:
> On Fri, May 29, 2020 at 6:13 AM Carlos O'Donell <carlos@redhat.com> wrote:
>>
>> On 5/23/20 12:37 AM, H.J. Lu via Libc-alpha wrote:
>>> There is no single threshold value which is good for all workloads.
>>> I don't think we should change REP_STOSB_THRESHOLD to 1MB.
>>> On the other hand, the fixed threshold isn't flexible.  Please try this
>>> patch to see if you can set the threshold for your specific workload.
>>
>> My request here is that the manual include a documentation of what the
>> minimums are for the tunable. Even an example reference of the minimum
>> value would be useful for the tunable e.g. On AVX512 systems this value
>> is X, on AVX systems this value is Y, on all other systems Z.
>>
> 
> The logic of thresholds are:
> 
>  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
>   unsigned int minimum_rep_movsb_threshold;
>   /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
>   unsigned int rep_movsb_threshold;
>   if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
>       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
>     {
>       rep_movsb_threshold = 2048 * (64 / 16);
>       minimum_rep_movsb_threshold = 64 * 8;
>     }
>   else if (CPU_FEATURES_ARCH_P (cpu_features,
> AVX_Fast_Unaligned_Load))
>     {
>       rep_movsb_threshold = 2048 * (32 / 16);
>       minimum_rep_movsb_threshold = 32 * 8;
>     }
>   else
>     {
>       rep_movsb_threshold = 2048 * (16 / 16);
>       minimum_rep_movsb_threshold = 16 * 8;
>     }
>   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
>     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>   else
>     __x86_rep_movsb_threshold = rep_movsb_threshold;
> 
> We can't simply say AVX512 machines will use ZMM and AVX machines
> will use YMM.  It depends on other factors which are invisible to users.
> Can you suggest some paragraph for libc manual?

We must tell the users the lower limit, so they can avoid having their
settings ignored.

If we can't tell them the lower limit in the manual, then we must add
a way to print it.

Augment the libc.so.6 main() entry point to print all tunables with
a --list-tunables option and print the limit? Then in the manual just
say you have to look it up?
  
H.J. Lu June 1, 2020, 7:32 p.m. UTC | #6
On Fri, May 29, 2020 at 9:18 AM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On 5/29/20 9:21 AM, H.J. Lu wrote:
> > On Fri, May 29, 2020 at 6:13 AM Carlos O'Donell <carlos@redhat.com> wrote:
> >>
> >> On 5/23/20 12:37 AM, H.J. Lu via Libc-alpha wrote:
> >>> There is no single threshold value which is good for all workloads.
> >>> I don't think we should change REP_STOSB_THRESHOLD to 1MB.
> >>> On the other hand, the fixed threshold isn't flexible.  Please try this
> >>> patch to see if you can set the threshold for your specific workload.
> >>
> >> My request here is that the manual include a documentation of what the
> >> minimums are for the tunable. Even an example reference of the minimum
> >> value would be useful for the tunable e.g. On AVX512 systems this value
> >> is X, on AVX systems this value is Y, on all other systems Z.
> >>
> >
> > The logic of thresholds are:
> >
> >  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
> >   unsigned int minimum_rep_movsb_threshold;
> >   /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> >   unsigned int rep_movsb_threshold;
> >   if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
> >       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> >     {
> >       rep_movsb_threshold = 2048 * (64 / 16);
> >       minimum_rep_movsb_threshold = 64 * 8;
> >     }
> >   else if (CPU_FEATURES_ARCH_P (cpu_features,
> > AVX_Fast_Unaligned_Load))
> >     {
> >       rep_movsb_threshold = 2048 * (32 / 16);
> >       minimum_rep_movsb_threshold = 32 * 8;
> >     }
> >   else
> >     {
> >       rep_movsb_threshold = 2048 * (16 / 16);
> >       minimum_rep_movsb_threshold = 16 * 8;
> >     }
> >   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
> >     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> >   else
> >     __x86_rep_movsb_threshold = rep_movsb_threshold;
> >
> > We can't simply say AVX512 machines will use ZMM and AVX machines
> > will use YMM.  It depends on other factors which are invisible to users.
> > Can you suggest some paragraph for libc manual?
>
> We must tell the users the lower limit, so they can avoid having their
> settings ignored.
>
> If we can't tell them the lower limit in the manual, then we must add
> a way to print it.
>
> Augment the libc.so.6 main() entry point to print all tunables with
> a --list-tunables option and print the limit? Then in the manual just

Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
any arguments.

> say you have to look it up?
>
> --
> Cheers,
> Carlos.
>
  
Carlos O'Donell June 1, 2020, 7:38 p.m. UTC | #7
On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> any arguments.

Yes, I mean adding argument processing to libc.so.6, and handling
--list-tunables.

We have enough infrastructure in place that wiring that up shouldn't be too bad?

Then, even in trimmed down containers, you can just run
/lib64/libc.so.6 --list-tunables and get back the list of tunables and
their min, max, and security values.

The alternative is a glibc-tunables binary which does only this, but
that seems like waste.

Cheers,
Carlos.
  
H.J. Lu June 1, 2020, 8:15 p.m. UTC | #8
On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> > any arguments.
>
> Yes, I mean adding argument processing to libc.so.6, and handling
> --list-tunables.
>
> We have enough infrastructure in place that wiring that up shouldn't be too bad?
>
> Then, even in trimmed down containers, you can just run
> /lib64/libc.so.6 --list-tunables and get back the list of tunables and
> their min, max, and security values.

Adding an argument to libc.so.6 is difficult since argument passing is
processor specific.  Adding --list-tunables to ld.so is more doable.

> The alternative is a glibc-tunables binary which does only this, but
> that seems like waste.
>
> Cheers,
> Carlos.
>
  
H.J. Lu June 1, 2020, 8:19 p.m. UTC | #9
On Mon, Jun 1, 2020 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
> >
> > On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> > > any arguments.
> >
> > Yes, I mean adding argument processing to libc.so.6, and handling
> > --list-tunables.
> >
> > We have enough infrastructure in place that wiring that up shouldn't be too bad?
> >
> > Then, even in trimmed down containers, you can just run
> > /lib64/libc.so.6 --list-tunables and get back the list of tunables and
> > their min, max, and security values.
>
> Adding an argument to libc.so.6 is difficult since argument passing is
> processor specific.  Adding --list-tunables to ld.so is more doable.

But tunables are in libc.so.
  
Florian Weimer June 1, 2020, 8:48 p.m. UTC | #10
* H. J. Lu via Libc-alpha:

> On Mon, Jun 1, 2020 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
>> >
>> > On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> > > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
>> > > any arguments.
>> >
>> > Yes, I mean adding argument processing to libc.so.6, and handling
>> > --list-tunables.
>> >
>> > We have enough infrastructure in place that wiring that up shouldn't be too bad?
>> >
>> > Then, even in trimmed down containers, you can just run
>> > /lib64/libc.so.6 --list-tunables and get back the list of tunables and
>> > their min, max, and security values.
>>
>> Adding an argument to libc.so.6 is difficult since argument passing is
>> processor specific.  Adding --list-tunables to ld.so is more doable.
>
> But tunables are in libc.so.

If this is really a problem, we can load libc.so and call a
GLIBC_PRIVATE function to print the information.
  
Carlos O'Donell June 1, 2020, 8:56 p.m. UTC | #11
On Mon, Jun 1, 2020 at 4:48 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>
> * H. J. Lu via Libc-alpha:
>
> > On Mon, Jun 1, 2020 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
> >> >
> >> > On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> > > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> >> > > any arguments.
> >> >
> >> > Yes, I mean adding argument processing to libc.so.6, and handling
> >> > --list-tunables.
> >> >
> >> > We have enough infrastructure in place that wiring that up shouldn't be too bad?
> >> >
> >> > Then, even in trimmed down containers, you can just run
> >> > /lib64/libc.so.6 --list-tunables and get back the list of tunables and
> >> > their min, max, and security values.
> >>
> >> Adding an argument to libc.so.6 is difficult since argument passing is
> >> processor specific.  Adding --list-tunables to ld.so? is more doable.
> >
> > But tunables are in libc.so.
>
> If this is really a problem, we can load libc.so and call a
> GLIBC_PRIVATE function to print the information.

Agreed.

Please keep in mind the original problem we are trying to solve.

We want a tunable for a parameter that is difficult to explain to the user.

To make it easier for our users to use the tunable we are going to
provide them a way to look at the tunable settings in detail.

Yes, it requires a target system, but we can't avoid that in some cases.

Cheers,
Carlos.
  
H.J. Lu June 1, 2020, 9:13 p.m. UTC | #12
On Mon, Jun 1, 2020 at 1:57 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On Mon, Jun 1, 2020 at 4:48 PM Florian Weimer <fw@deneb.enyo.de> wrote:
> >
> > * H. J. Lu via Libc-alpha:
> >
> > > On Mon, Jun 1, 2020 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >>
> > >> On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
> > >> >
> > >> > On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >> > > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> > >> > > any arguments.
> > >> >
> > >> > Yes, I mean adding argument processing to libc.so.6, and handling
> > >> > --list-tunables.
> > >> >
> > >> > We have enough infrastructure in place that wiring that up shouldn't be too bad?
> > >> >
> > >> > Then, even in trimmed down containers, you can just run
> > >> > /lib64/libc.so.6 --list-tunables and get back the list of tunables and
> > >> > their min, max, and security values.
> > >>
> > >> Adding an argument to libc.so.6 is difficult since argument passing is
> > >> processor specific.  Adding --list-tunables to ld.so? is more doable.
> > >
> > > But tunables are in libc.so.
> >
> > If this is really a problem, we can load libc.so and call a
> > GLIBC_PRIVATE function to print the information.
>
> Agreed.
>
> Please keep in mind the original problem we are trying to solve.
>
> We want a tunable for a parameter that is difficult to explain to the user.
>
> To make it easier for our users to use the tunable we are going to
> provide them a way to look at the tunable settings in detail.
>
> Yes, it requires a target system, but we can't avoid that in some cases.
>

Something like this?
  
H.J. Lu June 1, 2020, 10:43 p.m. UTC | #13
On Mon, Jun 1, 2020 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 1, 2020 at 1:57 PM Carlos O'Donell <carlos@redhat.com> wrote:
> >
> > On Mon, Jun 1, 2020 at 4:48 PM Florian Weimer <fw@deneb.enyo.de> wrote:
> > >
> > > * H. J. Lu via Libc-alpha:
> > >
> > > > On Mon, Jun 1, 2020 at 1:15 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >>
> > > >> On Mon, Jun 1, 2020 at 12:38 PM Carlos O'Donell <carlos@redhat.com> wrote:
> > > >> >
> > > >> > On Mon, Jun 1, 2020 at 3:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >> > > Did you mean adding  --list-tunables to ld.so?  libc.so.6 doesn't take
> > > >> > > any arguments.
> > > >> >
> > > >> > Yes, I mean adding argument processing to libc.so.6, and handling
> > > >> > --list-tunables.
> > > >> >
> > > >> > We have enough infrastructure in place that wiring that up shouldn't be too bad?
> > > >> >
> > > >> > Then, even in trimmed down containers, you can just run
> > > >> > /lib64/libc.so.6 --list-tunables and get back the list of tunables and
> > > >> > their min, max, and security values.
> > > >>
> > > >> Adding an argument to libc.so.6 is difficult since argument passing is
> > > >> processor specific.  Adding --list-tunables to ld.so? is more doable.
> > > >
> > > > But tunables are in libc.so.
> > >
> > > If this is really a problem, we can load libc.so and call a
> > > GLIBC_PRIVATE function to print the information.
> >
> > Agreed.
> >
> > Please keep in mind the original problem we are trying to solve.
> >
> > We want a tunable for a parameter that is difficult to explain to the user.
> >
> > To make it easier for our users to use the tunable we are going to
> > provide them a way to look at the tunable settings in detail.
> >
> > Yes, it requires a target system, but we can't avoid that in some cases.
> >
>
> Something like this?
>

Tunables are designed to pass info from user to glibc, not the other
way around.  When __libc_main is called, init_cacheinfo is never
called.  I can call init_cacheinfo from __libc_main.  But there is no
interface to update min and max values from init_cacheinfo.  I don't
think --list-tunables will work here without changes to tunables.
  
Carlos O'Donell June 2, 2020, 2:08 a.m. UTC | #14
On Mon, Jun 1, 2020 at 6:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> Tunables are designed to pass info from user to glibc, not the other
> way around.  When __libc_main is called, init_cacheinfo is never
> called.  I can call init_cacheinfo from __libc_main.  But there is no
> interface to update min and max values from init_cacheinfo.  I don't
> think --list-tunables will work here without changes to tunables.

You have a dynamic threshold.

You have to tell the user what that minimum is, otherwise they can't
use the tunable reliably.

This is the first instance of a min/max that is dynamically determined.

You must fetch the cache info ahead of the tunable initialization, that
is you must call init_cacheinfo before __init_tunables.

You can initialize the tunable data dynamically like this:

/* Dynamically set the min and max of glibc.foo.bar.  */
tunable_id_t id = TUNABLE_ENUM_NAME (glibc, foo, bar);
tunable_list[id].type.min = lowval;
tunable_list[id].type.max = highval;

We do something similar for maybe_enable_malloc_check.

Then once the tunables are parsed, and the cpu features are loaded
you can print the tunables, and the printed tunables will have meaningful
min and max values.

If you have circular dependency, then you must process the cpu features
first without reading from the tunables, then allow the tunables to be
initialized from the system, *then* process the tunables to alter the existing
cpu feature settings.

Cheers,
Carlos.
  
H.J. Lu June 4, 2020, 9 p.m. UTC | #15
On Mon, Jun 1, 2020 at 7:08 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On Mon, Jun 1, 2020 at 6:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > Tunables are designed to pass info from user to glibc, not the other
> > way around.  When __libc_main is called, init_cacheinfo is never
> > called.  I can call init_cacheinfo from __libc_main.  But there is no
> > interface to update min and max values from init_cacheinfo.  I don't
> > think --list-tunables will work here without changes to tunables.
>
> You have a dynamic threshold.
>
> You have to tell the user what that minimum is, otherwise they can't
> use the tunable reliably.
>
> This is the first instance of a min/max that is dynamically determined.
>
> You must fetch the cache info ahead of the tunable initialization, that
> is you must call init_cacheinfo before __init_tunables.
>
> You can initialize the tunable data dynamically like this:
>
> /* Dynamically set the min and max of glibc.foo.bar.  */
> tunable_id_t id = TUNABLE_ENUM_NAME (glibc, foo, bar);
> tunable_list[id].type.min = lowval;
> tunable_list[id].type.max = highval;
>
> We do something similar for maybe_enable_malloc_check.
>
> Then once the tunables are parsed, and the cpu features are loaded
> you can print the tunables, and the printed tunables will have meaningful
> min and max values.
>
> If you have circular dependency, then you must process the cpu features
> first without reading from the tunables, then allow the tunables to be
> initialized from the system, *then* process the tunables to alter the existing
> cpu feature settings.
>

How about this?  I got

[hjl@gnu-cfl-2 build-x86_64-linux]$ ./elf/ld.so ./libc.so --list-tunables
tunables:
  glibc.elision.skip_lock_after_retries: 0x3 (min: 0x80000000, max: 0x7fffffff)
  glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.malloc.perturb: 0x0 (min: 0x0, max: 0xff)
  glibc.cpu.x86_shared_cache_size: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.elision.tries: 0x3 (min: 0x80000000, max: 0x7fffffff)
  glibc.elision.enable: 0x0 (min: 0x0, max: 0x1)
  glibc.cpu.x86_rep_movsb_threshold: 0x1000 (min: 0x100, max:
0xffffffffffffffff)
  glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.elision.skip_lock_busy: 0x3 (min: 0x80000000, max: 0x7fffffff)
  glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x0, max: 0xffffffffffffffff)
  glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x0, max:
0xffffffffffffffff)
  glibc.cpu.x86_shstk:
  glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
  glibc.malloc.mmap_max: 0x0 (min: 0x80000000, max: 0x7fffffff)
  glibc.elision.skip_trylock_internal_abort: 0x3 (min: 0x80000000,
max: 0x7fffffff)
  glibc.malloc.tcache_unsorted_limit: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.cpu.x86_ibt:
  glibc.cpu.hwcaps:
  glibc.elision.skip_lock_internal_abort: 0x3 (min: 0x80000000, max: 0x7fffffff)
  glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0xffffffffffffffff)
  glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.cpu.x86_data_cache_size: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.malloc.tcache_count: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0xffffffffffffffff)
  glibc.pthread.mutex_spin_count: 0x64 (min: 0x0, max: 0x7fff)
  glibc.malloc.tcache_max: 0x0 (min: 0x0, max: 0xffffffffffffffff)
  glibc.malloc.check: 0x0 (min: 0x0, max: 0x3)
[hjl@gnu-cfl-2 build-x86_64-linux]$
  

Patch

From 7d2e0c0b843d509716d92960b9b139b32eacea54 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 9 May 2020 11:13:57 -0700
Subject: [PATCH] x86: Add thresholds for "rep movsb/stosb" to tunables

Add x86_rep_movsb_threshold and x86_rep_stosb_threshold to tunables
to update thresholds for "rep movsb" and "rep stosb" at run-time.

Note that the user specified threshold for "rep movsb" smaller than
the minimum threshold will be ignored.
---
 manual/tunables.texi                          | 16 +++++++
 sysdeps/x86/cacheinfo.c                       | 46 +++++++++++++++++++
 sysdeps/x86/cpu-features.c                    |  4 ++
 sysdeps/x86/cpu-features.h                    |  4 ++
 sysdeps/x86/dl-tunables.list                  |  6 +++
 .../multiarch/memmove-vec-unaligned-erms.S    | 16 +------
 .../multiarch/memset-vec-unaligned-erms.S     | 12 +----
 7 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/manual/tunables.texi b/manual/tunables.texi
index ec18b10834..8054f79be0 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -396,6 +396,22 @@  to set threshold in bytes for non temporal store.
 This tunable is specific to i386 and x86-64.
 @end deftp
 
+@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
+The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user
+to set the threshold in bytes to start using "rep movsb".  Note that a
+user-specified threshold smaller than the minimum threshold will be
+ignored.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+@deftp Tunable glibc.cpu.x86_rep_stosb_threshold
+The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user
+to set the threshold in bytes to start using "rep stosb".
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
 @deftp Tunable glibc.cpu.x86_ibt
 The @code{glibc.cpu.x86_ibt} tunable allows the user to control how
 indirect branch tracking (IBT) should be enabled.  Accepted values are
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 311502dee3..4322328a1b 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -530,6 +530,23 @@  long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 /* Threshold to use non temporal store.  */
 long int __x86_shared_non_temporal_threshold attribute_hidden;
 
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+long int __x86_rep_movsb_threshold attribute_hidden = 2048;
+
+/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
+   up REP STOSB operation, REP STOSB isn't faster on short data.  The
+   memset micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP STOSB becomes faster on processors with
+   Enhanced REP STOSB.  Since the stored value is fixed, larger register
+   size has minimal impact on threshold.  */
+long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+
 #ifndef DISABLE_PREFETCHW
 /* PREFETCHW support flag for use in memory and string routines.  */
 int __x86_prefetchw attribute_hidden;
@@ -872,6 +889,35 @@  init_cacheinfo (void)
     = (cpu_features->non_temporal_threshold != 0
        ? cpu_features->non_temporal_threshold
        : __x86_shared_cache_size * threads * 3 / 4);
+
+  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+  unsigned int minimum_rep_movsb_threshold;
+  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  unsigned int rep_movsb_threshold;
+  if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    {
+      rep_movsb_threshold = 2048 * (64 / 16);
+      minimum_rep_movsb_threshold = 64 * 8;
+    }
+  else if (CPU_FEATURES_ARCH_P (cpu_features,
+				AVX_Fast_Unaligned_Load))
+    {
+      rep_movsb_threshold = 2048 * (32 / 16);
+      minimum_rep_movsb_threshold = 32 * 8;
+    }
+  else
+    {
+      rep_movsb_threshold = 2048 * (16 / 16);
+      minimum_rep_movsb_threshold = 16 * 8;
+    }
+  if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
+    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
+  else
+    __x86_rep_movsb_threshold = rep_movsb_threshold;
+
+  if (cpu_features->rep_stosb_threshold)
+    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
 }
 
 #endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 916bbf5242..14f847320f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -564,6 +564,10 @@  no_cpuid:
   TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
   cpu_features->non_temporal_threshold
     = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
+  cpu_features->rep_movsb_threshold
+    = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
+  cpu_features->rep_stosb_threshold
+    = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
   cpu_features->data_cache_size
     = TUNABLE_GET (x86_data_cache_size, long int, NULL);
   cpu_features->shared_cache_size
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index f05d5ce158..7410324e83 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -91,6 +91,10 @@  struct cpu_features
   unsigned long int shared_cache_size;
   /* Threshold to use non temporal store.  */
   unsigned long int non_temporal_threshold;
+  /* Threshold to use "rep movsb".  */
+  unsigned long int rep_movsb_threshold;
+  /* Threshold to use "rep stosb".  */
+  unsigned long int rep_stosb_threshold;
 };
 
 /* Used from outside of glibc to get access to the CPU features
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index 251b926ce4..43bf6c2389 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,12 @@  glibc {
     x86_non_temporal_threshold {
       type: SIZE_T
     }
+    x86_rep_movsb_threshold {
+      type: SIZE_T
+    }
+    x86_rep_stosb_threshold {
+      type: SIZE_T
+    }
     x86_data_cache_size {
       type: SIZE_T
     }
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 74953245aa..bd5dc1a3f3 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -56,17 +56,6 @@ 
 # endif
 #endif
 
-/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
-   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
-   memcpy micro benchmark in glibc shows that 2KB is the approximate
-   value above which REP MOVSB becomes faster than SSE2 optimization
-   on processors with Enhanced REP MOVSB.  Since larger register size
-   can move more data with a single load and store, the threshold is
-   higher with larger register size.  */
-#ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
-#endif
-
 #ifndef PREFETCH
 # define PREFETCH(addr) prefetcht0 addr
 #endif
@@ -253,9 +242,6 @@  L(movsb):
 	leaq	(%rsi,%rdx), %r9
 	cmpq	%r9, %rdi
 	/* Avoid slow backward REP MOVSB.  */
-# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
-#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
-# endif
 	jb	L(more_8x_vec_backward)
 1:
 	mov	%RDX_LP, %RCX_LP
@@ -331,7 +317,7 @@  L(between_2_3):
 
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
-	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index af2299709c..2bfc95de05 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,16 +58,6 @@ 
 # endif
 #endif
 
-/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
-   up REP STOSB operation, REP STOSB isn't faster on short data.  The
-   memset micro benchmark in glibc shows that 2KB is the approximate
-   value above which REP STOSB becomes faster on processors with
-   Enhanced REP STOSB.  Since the stored value is fixed, larger register
-   size has minimal impact on threshold.  */
-#ifndef REP_STOSB_THRESHOLD
-# define REP_STOSB_THRESHOLD		2048
-#endif
-
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -181,7 +171,7 @@  ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	ret
 
 L(stosb_more_2x_vec):
-	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 	ja	L(stosb)
 #endif
 L(more_2x_vec):
-- 
2.26.2