i386: Add br_mispredict_scale in cost table.
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
Hi,
For later processors, the pipeline went deeper so the penalty for
untaken branch can be larger than before. Add a new parameter
br_mispredict_scale to describe the penalty, and adopt to
noce_max_ifcvt_seq_cost hook to allow longer sequence to be
converted with cmove.
This improves cpu2017 544 with -Ofast -march=native for 14% on P-core
SPR, and 8% on E-core SRF, no other regression observed.
Bootstrapped & regtested on x86-64-pc-linux-gnu.
OK for trunk?
gcc/ChangeLog:
* config/i386/i386.cc (ix86_noce_max_ifcvt_seq_cost): Adjust
cost with ix86_tune_cost->br_mispredict_scale.
* config/i386/i386.h (processor_costs): Add br_mispredict_scale.
* config/i386/x86-tune-costs.h: Add new br_mispredict_scale to
all processor_costs, in which icelake_cost/alderlake_cost
with value COSTS_N_INSNS (2) + 3 and other processor with value
COSTS_N_INSNS (2).
gcc/testsuite/ChangeLog:
* gcc.target/i386/cmov12.c: New test.
---
gcc/config/i386/i386.cc | 8 ++++++-
gcc/config/i386/i386.h | 2 ++
gcc/config/i386/x86-tune-costs.h | 33 ++++++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/cmov12.c | 21 ++++++++++++++++
4 files changed, 63 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/cmov12.c
Comments
On Tue, Jan 7, 2025 at 8:37 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi,
>
> For later processors, the pipeline went deeper so the penalty for
> untaken branch can be larger than before. Add a new parameter
> br_mispredict_scale to describe the penalty, and adopt to
> noce_max_ifcvt_seq_cost hook to allow longer sequence to be
> converted with cmove.
>
> This improves cpu2017 544 with -Ofast -march=native for 14% on P-core
> SPR, and 8% on E-core SRF, no other regression observed.
>
> Bootstrapped & regtested on x86-64-pc-linux-gnu.
>
> OK for trunk?
>
> gcc/ChangeLog:
>
> * config/i386/i386.cc (ix86_noce_max_ifcvt_seq_cost): Adjust
> cost with ix86_tune_cost->br_mispredict_scale.
> * config/i386/i386.h (processor_costs): Add br_mispredict_scale.
> * config/i386/x86-tune-costs.h: Add new br_mispredict_scale to
> all processor_costs, in which icelake_cost/alderlake_cost
> with value COSTS_N_INSNS (2) + 3 and other processor with value
> COSTS_N_INSNS (2).
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/cmov12.c: New test.
LGTM, but please note there are a whole lot of CMOVE issues on x86,
collected in [1] meta-bug, especially [2].
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85559
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56309
Thanks,
Uros.
> ---
> gcc/config/i386/i386.cc | 8 ++++++-
> gcc/config/i386/i386.h | 2 ++
> gcc/config/i386/x86-tune-costs.h | 33 ++++++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/cmov12.c | 21 ++++++++++++++++
> 4 files changed, 63 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/cmov12.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 655335e2f47..11770aa8a50 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25088,7 +25088,13 @@ ix86_max_noce_ifcvt_seq_cost (edge e)
> return param_max_rtl_if_conversion_unpredictable_cost;
> }
>
> - return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
> + /* For modern machines with deeper pipeline, the penalty for branch
> + misprediction could be higher than before to reset the pipeline
> + slots. Add parameter br_mispredict_scale as a factor to describe
> + the impact of reseting the pipeline. */
> +
> + return BRANCH_COST (true, predictable_p)
> + * ix86_tune_cost->br_mispredict_scale;
> }
>
> /* Return true if SEQ is a good candidate as a replacement for the
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 00626d539a9..e8e528c7811 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -232,6 +232,8 @@ struct processor_costs {
> to be unrolled. */
> const unsigned small_unroll_factor; /* Unroll factor for small loop to
> be unrolled. */
> + const int br_mispredict_scale; /* Branch mispredict scale for ifcvt
> + threshold. */
> };
>
> extern const struct processor_costs *ix86_cost;
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 56a09f12b94..a4a128cd5dd 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -137,6 +137,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> NULL, /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* Processor costs (relative to an add) */
> @@ -248,6 +249,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> "4", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs i486_memcpy[2] = {
> @@ -360,6 +362,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs pentium_memcpy[2] = {
> @@ -470,6 +473,7 @@ struct processor_costs pentium_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static const
> @@ -573,6 +577,7 @@ struct processor_costs lakemont_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
> @@ -691,6 +696,7 @@ struct processor_costs pentiumpro_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs geode_memcpy[2] = {
> @@ -800,6 +806,7 @@ struct processor_costs geode_cost = {
> NULL, /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs k6_memcpy[2] = {
> @@ -912,6 +919,7 @@ struct processor_costs k6_cost = {
> "32", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* For some reason, Athlon deals better with REP prefix (relative to loops)
> @@ -1025,6 +1033,7 @@ struct processor_costs athlon_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* K8 has optimized REP instruction for medium sized blocks, but for very
> @@ -1147,6 +1156,7 @@ struct processor_costs k8_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
> @@ -1277,6 +1287,7 @@ struct processor_costs amdfam10_cost = {
> "32", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* BDVER has optimized REP instruction for medium sized blocks, but for
> @@ -1400,6 +1411,7 @@ const struct processor_costs bdver_cost = {
> "11", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
>
> @@ -1555,6 +1567,7 @@ struct processor_costs znver1_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
> @@ -1714,6 +1727,7 @@ struct processor_costs znver2_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> struct processor_costs znver3_cost = {
> @@ -1848,6 +1862,7 @@ struct processor_costs znver3_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* This table currently replicates znver3_cost table. */
> @@ -1984,6 +1999,7 @@ struct processor_costs znver4_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* This table currently replicates znver4_cost table. */
> @@ -2137,6 +2153,7 @@ struct processor_costs znver5_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* skylake_cost should produce code tuned for Skylake familly of CPUs. */
> @@ -2263,6 +2280,7 @@ struct processor_costs skylake_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* icelake_cost should produce code tuned for Icelake family of CPUs.
> @@ -2391,6 +2409,7 @@ struct processor_costs icelake_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
> };
>
> /* alderlake_cost should produce code tuned for alderlake family of CPUs. */
> @@ -2513,6 +2532,7 @@ struct processor_costs alderlake_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
> };
>
> /* BTVER1 has optimized REP instruction for medium sized blocks, but for
> @@ -2628,6 +2648,7 @@ const struct processor_costs btver1_cost = {
> "11", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs btver2_memcpy[2] = {
> @@ -2740,6 +2761,7 @@ const struct processor_costs btver2_cost = {
> "11", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs pentium4_memcpy[2] = {
> @@ -2851,6 +2873,7 @@ struct processor_costs pentium4_cost = {
> NULL, /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs nocona_memcpy[2] = {
> @@ -2965,6 +2988,7 @@ struct processor_costs nocona_cost = {
> NULL, /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs atom_memcpy[2] = {
> @@ -3077,6 +3101,7 @@ struct processor_costs atom_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs slm_memcpy[2] = {
> @@ -3189,6 +3214,7 @@ struct processor_costs slm_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs tremont_memcpy[2] = {
> @@ -3315,6 +3341,7 @@ struct processor_costs tremont_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> static stringop_algs intel_memcpy[2] = {
> @@ -3427,6 +3454,7 @@ struct processor_costs intel_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
> @@ -3544,6 +3572,7 @@ struct processor_costs lujiazui_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
> @@ -3659,6 +3688,7 @@ struct processor_costs yongfeng_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
> @@ -3774,6 +3804,7 @@ struct processor_costs shijidadao_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
>
> @@ -3897,6 +3928,7 @@ struct processor_costs generic_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> /* core_cost should produce code tuned for Core familly of CPUs. */
> @@ -4025,5 +4057,6 @@ struct processor_costs core_cost = {
> "16", /* Func alignment. */
> 4, /* Small unroll limit. */
> 2, /* Small unroll factor. */
> + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> };
>
> diff --git a/gcc/testsuite/gcc.target/i386/cmov12.c b/gcc/testsuite/gcc.target/i386/cmov12.c
> new file mode 100644
> index 00000000000..87de4f4117b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/cmov12.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx2 -mtune=sapphirerapids" } */
> +/* { dg-final { scan-assembler-times "cmovg" 3 } } */
> +
> +void foo(int *a, int n, int k)
> +{
> + int j, v;
> +
> + v = a[k - 1];
> + while (k <= n / 2) {
> + j = k + k;
> + if ((j < n) && (a[j - 1] < a[j]))
> + j++;
> + if (v >= a[j - 1])
> + break;
> + a[k - 1] = a[j - 1];
> + k = j;
> + }
> + a[k - 1] = v;
> +}
> +
> --
> 2.31.1
>
> LGTM, but please note there are a whole lot of CMOVE issues on x86,
> collected in [1] meta-bug, especially [2].
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85559
> [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56309
Thanks for noting this, I've verified my patch have no impact for the
test in [2]. We will keep monitoring those cmove issues.
Uros Bizjak <ubizjak@gmail.com> 于2025年1月7日周二 17:34写道:
>
> On Tue, Jan 7, 2025 at 8:37 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
> >
> > Hi,
> >
> > For later processors, the pipeline went deeper so the penalty for
> > untaken branch can be larger than before. Add a new parameter
> > br_mispredict_scale to describe the penalty, and adopt to
> > noce_max_ifcvt_seq_cost hook to allow longer sequence to be
> > converted with cmove.
> >
> > This improves cpu2017 544 with -Ofast -march=native for 14% on P-core
> > SPR, and 8% on E-core SRF, no other regression observed.
> >
> > Bootstrapped & regtested on x86-64-pc-linux-gnu.
> >
> > OK for trunk?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_noce_max_ifcvt_seq_cost): Adjust
> > cost with ix86_tune_cost->br_mispredict_scale.
> > * config/i386/i386.h (processor_costs): Add br_mispredict_scale.
> > * config/i386/x86-tune-costs.h: Add new br_mispredict_scale to
> > all processor_costs, in which icelake_cost/alderlake_cost
> > with value COSTS_N_INSNS (2) + 3 and other processor with value
> > COSTS_N_INSNS (2).
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/cmov12.c: New test.
>
> LGTM, but please note there are a whole lot of CMOVE issues on x86,
> collected in [1] meta-bug, especially [2].
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85559
> [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56309
>
> Thanks,
> Uros.
>
> > ---
> > gcc/config/i386/i386.cc | 8 ++++++-
> > gcc/config/i386/i386.h | 2 ++
> > gcc/config/i386/x86-tune-costs.h | 33 ++++++++++++++++++++++++++
> > gcc/testsuite/gcc.target/i386/cmov12.c | 21 ++++++++++++++++
> > 4 files changed, 63 insertions(+), 1 deletion(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/cmov12.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 655335e2f47..11770aa8a50 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -25088,7 +25088,13 @@ ix86_max_noce_ifcvt_seq_cost (edge e)
> > return param_max_rtl_if_conversion_unpredictable_cost;
> > }
> >
> > - return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
> > + /* For modern machines with deeper pipeline, the penalty for branch
> > + misprediction could be higher than before to reset the pipeline
> > + slots. Add parameter br_mispredict_scale as a factor to describe
> > + the impact of reseting the pipeline. */
> > +
> > + return BRANCH_COST (true, predictable_p)
> > + * ix86_tune_cost->br_mispredict_scale;
> > }
> >
> > /* Return true if SEQ is a good candidate as a replacement for the
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 00626d539a9..e8e528c7811 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -232,6 +232,8 @@ struct processor_costs {
> > to be unrolled. */
> > const unsigned small_unroll_factor; /* Unroll factor for small loop to
> > be unrolled. */
> > + const int br_mispredict_scale; /* Branch mispredict scale for ifcvt
> > + threshold. */
> > };
> >
> > extern const struct processor_costs *ix86_cost;
> > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > index 56a09f12b94..a4a128cd5dd 100644
> > --- a/gcc/config/i386/x86-tune-costs.h
> > +++ b/gcc/config/i386/x86-tune-costs.h
> > @@ -137,6 +137,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> > NULL, /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* Processor costs (relative to an add) */
> > @@ -248,6 +249,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
> > "4", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs i486_memcpy[2] = {
> > @@ -360,6 +362,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs pentium_memcpy[2] = {
> > @@ -470,6 +473,7 @@ struct processor_costs pentium_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static const
> > @@ -573,6 +577,7 @@ struct processor_costs lakemont_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
> > @@ -691,6 +696,7 @@ struct processor_costs pentiumpro_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs geode_memcpy[2] = {
> > @@ -800,6 +806,7 @@ struct processor_costs geode_cost = {
> > NULL, /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs k6_memcpy[2] = {
> > @@ -912,6 +919,7 @@ struct processor_costs k6_cost = {
> > "32", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* For some reason, Athlon deals better with REP prefix (relative to loops)
> > @@ -1025,6 +1033,7 @@ struct processor_costs athlon_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* K8 has optimized REP instruction for medium sized blocks, but for very
> > @@ -1147,6 +1156,7 @@ struct processor_costs k8_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
> > @@ -1277,6 +1287,7 @@ struct processor_costs amdfam10_cost = {
> > "32", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* BDVER has optimized REP instruction for medium sized blocks, but for
> > @@ -1400,6 +1411,7 @@ const struct processor_costs bdver_cost = {
> > "11", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> >
> > @@ -1555,6 +1567,7 @@ struct processor_costs znver1_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
> > @@ -1714,6 +1727,7 @@ struct processor_costs znver2_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > struct processor_costs znver3_cost = {
> > @@ -1848,6 +1862,7 @@ struct processor_costs znver3_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* This table currently replicates znver3_cost table. */
> > @@ -1984,6 +1999,7 @@ struct processor_costs znver4_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* This table currently replicates znver4_cost table. */
> > @@ -2137,6 +2153,7 @@ struct processor_costs znver5_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* skylake_cost should produce code tuned for Skylake familly of CPUs. */
> > @@ -2263,6 +2280,7 @@ struct processor_costs skylake_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* icelake_cost should produce code tuned for Icelake family of CPUs.
> > @@ -2391,6 +2409,7 @@ struct processor_costs icelake_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
> > };
> >
> > /* alderlake_cost should produce code tuned for alderlake family of CPUs. */
> > @@ -2513,6 +2532,7 @@ struct processor_costs alderlake_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
> > };
> >
> > /* BTVER1 has optimized REP instruction for medium sized blocks, but for
> > @@ -2628,6 +2648,7 @@ const struct processor_costs btver1_cost = {
> > "11", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs btver2_memcpy[2] = {
> > @@ -2740,6 +2761,7 @@ const struct processor_costs btver2_cost = {
> > "11", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs pentium4_memcpy[2] = {
> > @@ -2851,6 +2873,7 @@ struct processor_costs pentium4_cost = {
> > NULL, /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs nocona_memcpy[2] = {
> > @@ -2965,6 +2988,7 @@ struct processor_costs nocona_cost = {
> > NULL, /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs atom_memcpy[2] = {
> > @@ -3077,6 +3101,7 @@ struct processor_costs atom_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs slm_memcpy[2] = {
> > @@ -3189,6 +3214,7 @@ struct processor_costs slm_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs tremont_memcpy[2] = {
> > @@ -3315,6 +3341,7 @@ struct processor_costs tremont_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > static stringop_algs intel_memcpy[2] = {
> > @@ -3427,6 +3454,7 @@ struct processor_costs intel_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
> > @@ -3544,6 +3572,7 @@ struct processor_costs lujiazui_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
> > @@ -3659,6 +3688,7 @@ struct processor_costs yongfeng_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
> > @@ -3774,6 +3804,7 @@ struct processor_costs shijidadao_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> >
> > @@ -3897,6 +3928,7 @@ struct processor_costs generic_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > /* core_cost should produce code tuned for Core familly of CPUs. */
> > @@ -4025,5 +4057,6 @@ struct processor_costs core_cost = {
> > "16", /* Func alignment. */
> > 4, /* Small unroll limit. */
> > 2, /* Small unroll factor. */
> > + COSTS_N_INSNS (2), /* Branch mispredict scale. */
> > };
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/cmov12.c b/gcc/testsuite/gcc.target/i386/cmov12.c
> > new file mode 100644
> > index 00000000000..87de4f4117b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/cmov12.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx2 -mtune=sapphirerapids" } */
> > +/* { dg-final { scan-assembler-times "cmovg" 3 } } */
> > +
> > +void foo(int *a, int n, int k)
> > +{
> > + int j, v;
> > +
> > + v = a[k - 1];
> > + while (k <= n / 2) {
> > + j = k + k;
> > + if ((j < n) && (a[j - 1] < a[j]))
> > + j++;
> > + if (v >= a[j - 1])
> > + break;
> > + a[k - 1] = a[j - 1];
> > + k = j;
> > + }
> > + a[k - 1] = v;
> > +}
> > +
> > --
> > 2.31.1
> >
@@ -25088,7 +25088,13 @@ ix86_max_noce_ifcvt_seq_cost (edge e)
return param_max_rtl_if_conversion_unpredictable_cost;
}
- return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
+ /* For modern machines with deeper pipeline, the penalty for branch
+ misprediction could be higher than before to reset the pipeline
+ slots. Add parameter br_mispredict_scale as a factor to describe
+ the impact of reseting the pipeline. */
+
+ return BRANCH_COST (true, predictable_p)
+ * ix86_tune_cost->br_mispredict_scale;
}
/* Return true if SEQ is a good candidate as a replacement for the
@@ -232,6 +232,8 @@ struct processor_costs {
to be unrolled. */
const unsigned small_unroll_factor; /* Unroll factor for small loop to
be unrolled. */
+ const int br_mispredict_scale; /* Branch mispredict scale for ifcvt
+ threshold. */
};
extern const struct processor_costs *ix86_cost;
@@ -137,6 +137,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
NULL, /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* Processor costs (relative to an add) */
@@ -248,6 +249,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
"4", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs i486_memcpy[2] = {
@@ -360,6 +362,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs pentium_memcpy[2] = {
@@ -470,6 +473,7 @@ struct processor_costs pentium_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static const
@@ -573,6 +577,7 @@ struct processor_costs lakemont_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -691,6 +696,7 @@ struct processor_costs pentiumpro_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs geode_memcpy[2] = {
@@ -800,6 +806,7 @@ struct processor_costs geode_cost = {
NULL, /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs k6_memcpy[2] = {
@@ -912,6 +919,7 @@ struct processor_costs k6_cost = {
"32", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -1025,6 +1033,7 @@ struct processor_costs athlon_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -1147,6 +1156,7 @@ struct processor_costs k8_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1277,6 +1287,7 @@ struct processor_costs amdfam10_cost = {
"32", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* BDVER has optimized REP instruction for medium sized blocks, but for
@@ -1400,6 +1411,7 @@ const struct processor_costs bdver_cost = {
"11", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
@@ -1555,6 +1567,7 @@ struct processor_costs znver1_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* ZNVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1714,6 +1727,7 @@ struct processor_costs znver2_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
struct processor_costs znver3_cost = {
@@ -1848,6 +1862,7 @@ struct processor_costs znver3_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* This table currently replicates znver3_cost table. */
@@ -1984,6 +1999,7 @@ struct processor_costs znver4_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* This table currently replicates znver4_cost table. */
@@ -2137,6 +2153,7 @@ struct processor_costs znver5_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
@@ -2263,6 +2280,7 @@ struct processor_costs skylake_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* icelake_cost should produce code tuned for Icelake family of CPUs.
@@ -2391,6 +2409,7 @@ struct processor_costs icelake_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
};
/* alderlake_cost should produce code tuned for alderlake family of CPUs. */
@@ -2513,6 +2532,7 @@ struct processor_costs alderlake_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */
};
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -2628,6 +2648,7 @@ const struct processor_costs btver1_cost = {
"11", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs btver2_memcpy[2] = {
@@ -2740,6 +2761,7 @@ const struct processor_costs btver2_cost = {
"11", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs pentium4_memcpy[2] = {
@@ -2851,6 +2873,7 @@ struct processor_costs pentium4_cost = {
NULL, /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs nocona_memcpy[2] = {
@@ -2965,6 +2988,7 @@ struct processor_costs nocona_cost = {
NULL, /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs atom_memcpy[2] = {
@@ -3077,6 +3101,7 @@ struct processor_costs atom_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs slm_memcpy[2] = {
@@ -3189,6 +3214,7 @@ struct processor_costs slm_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs tremont_memcpy[2] = {
@@ -3315,6 +3341,7 @@ struct processor_costs tremont_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
static stringop_algs intel_memcpy[2] = {
@@ -3427,6 +3454,7 @@ struct processor_costs intel_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
@@ -3544,6 +3572,7 @@ struct processor_costs lujiazui_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
@@ -3659,6 +3688,7 @@ struct processor_costs yongfeng_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
@@ -3774,6 +3804,7 @@ struct processor_costs shijidadao_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
@@ -3897,6 +3928,7 @@ struct processor_costs generic_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
/* core_cost should produce code tuned for Core familly of CPUs. */
@@ -4025,5 +4057,6 @@ struct processor_costs core_cost = {
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx2 -mtune=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "cmovg" 3 } } */
+
+void foo(int *a, int n, int k)
+{
+ int j, v;
+
+ v = a[k - 1];
+ while (k <= n / 2) {
+ j = k + k;
+ if ((j < n) && (a[j - 1] < a[j]))
+ j++;
+ if (v >= a[j - 1])
+ break;
+ a[k - 1] = a[j - 1];
+ k = j;
+ }
+ a[k - 1] = v;
+}
+