aarch64: modify Ampere CPU tunings on reassociation/FMA
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 |
success
|
Testing passed
|
Commit Message
This patch modifies tunings for ampere1/ampere1a/ampere1b, to:
1. Allow reassociation on FP additions.
2. Avoid generating loop-dependant FMA chains. Added a tuning
option for this.
Bootstrapped and tested. Is this ok for trunk?
Thanks,
Di Zhao
gcc/ChangeLog:
* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
New tuing option to avoid cross-loop FMA.
* config/aarch64/aarch64.cc (aarch64_override_options_internal): Set
param_avoid_fma_max_bits according to tuning option.
* config/aarch64/tuning_models/ampere1.h: Modify tunings related with
FMA.
* config/aarch64/tuning_models/ampere1a.h: Modify tunings related with
FMA.
* config/aarch64/tuning_models/ampere1b.h: Modify tunings related with
FMA.
---
gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
gcc/config/aarch64/aarch64.cc | 6 ++++++
gcc/config/aarch64/tuning_models/ampere1.h | 2 +-
gcc/config/aarch64/tuning_models/ampere1a.h | 4 ++--
gcc/config/aarch64/tuning_models/ampere1b.h | 5 +++--
5 files changed, 14 insertions(+), 5 deletions(-)
Comments
On 30/11/2023 08:27, Di Zhao OS wrote:
> This patch modifies tunings for ampere1/ampere1a/ampere1b, to:
>
> 1. Allow reassociation on FP additions.
> 2. Avoid generating loop-dependant FMA chains. Added a tuning
> option for this.
>
> Bootstrapped and tested. Is this ok for trunk?
>
> Thanks,
> Di Zhao
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
> New tuing option to avoid cross-loop FMA.
typo: tuning
> * config/aarch64/aarch64.cc (aarch64_override_options_internal): Set
> param_avoid_fma_max_bits according to tuning option.
> * config/aarch64/tuning_models/ampere1.h: Modify tunings related with
> FMA.
> * config/aarch64/tuning_models/ampere1a.h: Modify tunings related with
> FMA.
> * config/aarch64/tuning_models/ampere1b.h: Modify tunings related with
> FMA.
>
You need to mention the name of the structure you're modifying. Also we
usually just write 'Likewise.' if the change is identical in effect to
the change immediately above, so
* config/aarch64/tuning_models/ampere1.h (ampere1_tunings):
Modify tunings related with FMA.
* config/aarch64/tuning_models/ampere1a.h (ampere1a_tunings):
Likewise.
* config/aarch64/tuning_models/ampere1b.h (ampere1b_tunings):
Likewise.
Finally, watch your line length. The total length of the line should
not go beyond column 72 in commit log entries, unless that involves
breaking a single word on a line.
Otherwise, this is OK.
R.
> ---
> gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
> gcc/config/aarch64/aarch64.cc | 6 ++++++
> gcc/config/aarch64/tuning_models/ampere1.h | 2 +-
> gcc/config/aarch64/tuning_models/ampere1a.h | 4 ++--
> gcc/config/aarch64/tuning_models/ampere1b.h | 5 +++--
> 5 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index 774568e9106..f28a73839a6 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -47,4 +47,6 @@ AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
>
> AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
>
> +AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
> +
> #undef AARCH64_EXTRA_TUNING_OPTION
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 64684258b7b..28bc70a787f 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16083,6 +16083,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
> && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
> opts->x_flag_prefetch_loop_arrays = 1;
>
> + /* Avoid loop-dependant FMA chains. */
> + if (aarch64_tune_params.extra_tuning_flags
> + & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
> + SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
> + 512);
> +
> aarch64_override_options_after_change_1 (opts);
> }
>
> diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
> index 8d2a1c69610..a144e8f94b3 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1.h
> @@ -104,7 +104,7 @@ static const struct tune_params ampere1_tunings =
> 2, /* min_div_recip_mul_df. */
> 0, /* max_case_values. */
> tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
> + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> &ere1_prefetch_tune,
> AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
> index c419ffb3c1a..f688ed08a79 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1a.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1a.h
> @@ -50,13 +50,13 @@ static const struct tune_params ampere1a_tunings =
> "32:16", /* loop_align. */
> 2, /* int_reassoc_width. */
> 4, /* fp_reassoc_width. */
> - 1, /* fma_reassoc_width. */
> + 4, /* fma_reassoc_width. */
> 2, /* vec_reassoc_width. */
> 2, /* min_div_recip_mul_sf. */
> 2, /* min_div_recip_mul_df. */
> 0, /* max_case_values. */
> tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
> + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> &ere1_prefetch_tune,
> AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h
> index c4928f50d29..a98b6a980f7 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1b.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1b.h
> @@ -99,13 +99,14 @@ static const struct tune_params ampere1b_tunings =
> "32:16", /* loop_align. */
> 2, /* int_reassoc_width. */
> 4, /* fp_reassoc_width. */
> - 1, /* fma_reassoc_width. */
> + 4, /* fma_reassoc_width. */
> 2, /* vec_reassoc_width. */
> 2, /* min_div_recip_mul_sf. */
> 2, /* min_div_recip_mul_df. */
> 0, /* max_case_values. */
> tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
> - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
> + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
> + AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> &ere1b_prefetch_tune,
> AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
Hello Richard,
Thank you for the review. Fixed the problems and committed to master.
Thanks,
Di
> -----Original Message-----
> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> Sent: Thursday, November 30, 2023 8:21 PM
> To: Di Zhao OS <dizhao@os.amperecomputing.com>; gcc-patches@gcc.gnu.org
> Cc: Philipp Tomsich <philipp.tomsich@vrull.eu>
> Subject: Re: [PATCH] aarch64: modify Ampere CPU tunings on reassociation/FMA
>
>
>
> On 30/11/2023 08:27, Di Zhao OS wrote:
> > This patch modifies tunings for ampere1/ampere1a/ampere1b, to:
> >
> > 1. Allow reassociation on FP additions.
> > 2. Avoid generating loop-dependant FMA chains. Added a tuning
> > option for this.
> >
> > Bootstrapped and tested. Is this ok for trunk?
> >
> > Thanks,
> > Di Zhao
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-tuning-flags.def
> (AARCH64_EXTRA_TUNING_OPTION):
> > New tuing option to avoid cross-loop FMA.
>
> typo: tuning
>
> > * config/aarch64/aarch64.cc (aarch64_override_options_internal):
> Set
> > param_avoid_fma_max_bits according to tuning option.
> > * config/aarch64/tuning_models/ampere1.h: Modify tunings related
> with
> > FMA.
> > * config/aarch64/tuning_models/ampere1a.h: Modify tunings related
> with
> > FMA.
> > * config/aarch64/tuning_models/ampere1b.h: Modify tunings related
> with
> > FMA.
> >
>
> You need to mention the name of the structure you're modifying. Also we
> usually just write 'Likewise.' if the change is identical in effect to
> the change immediately above, so
>
> * config/aarch64/tuning_models/ampere1.h (ampere1_tunings):
> Modify tunings related with FMA.
> * config/aarch64/tuning_models/ampere1a.h (ampere1a_tunings):
> Likewise.
> * config/aarch64/tuning_models/ampere1b.h (ampere1b_tunings):
> Likewise.
>
> Finally, watch your line length. The total length of the line should
> not go beyond column 72 in commit log entries, unless that involves
> breaking a single word on a line.
>
> Otherwise, this is OK.
>
> R.
>
> > ---
> > gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
> > gcc/config/aarch64/aarch64.cc | 6 ++++++
> > gcc/config/aarch64/tuning_models/ampere1.h | 2 +-
> > gcc/config/aarch64/tuning_models/ampere1a.h | 4 ++--
> > gcc/config/aarch64/tuning_models/ampere1b.h | 5 +++--
> > 5 files changed, 14 insertions(+), 5 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def
> b/gcc/config/aarch64/aarch64-tuning-flags.def
> > index 774568e9106..f28a73839a6 100644
> > --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> > @@ -47,4 +47,6 @@ AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs",
> USE_NEW_VECTOR_COSTS)
> >
> > AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput",
> MATCHED_VECTOR_THROUGHPUT)
> >
> > +AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
> > +
> > #undef AARCH64_EXTRA_TUNING_OPTION
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 64684258b7b..28bc70a787f 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -16083,6 +16083,12 @@ aarch64_override_options_internal (struct
> gcc_options *opts)
> > && opts->x_optimize >= aarch64_tune_params.prefetch-
> >default_opt_level)
> > opts->x_flag_prefetch_loop_arrays = 1;
> >
> > + /* Avoid loop-dependant FMA chains. */
> > + if (aarch64_tune_params.extra_tuning_flags
> > + & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
> > + SET_OPTION_IF_UNSET (opts, &global_options_set,
> param_avoid_fma_max_bits,
> > + 512);
> > +
> > aarch64_override_options_after_change_1 (opts);
> > }
> >
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1.h
> b/gcc/config/aarch64/tuning_models/ampere1.h
> > index 8d2a1c69610..a144e8f94b3 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1.h
> > @@ -104,7 +104,7 @@ static const struct tune_params ampere1_tunings =
> > 2, /* min_div_recip_mul_df. */
> > 0, /* max_case_values. */
> > tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> > - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
> > + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> > &ere1_prefetch_tune,
> > AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> > AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h
> b/gcc/config/aarch64/tuning_models/ampere1a.h
> > index c419ffb3c1a..f688ed08a79 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1a.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1a.h
> > @@ -50,13 +50,13 @@ static const struct tune_params ampere1a_tunings =
> > "32:16", /* loop_align. */
> > 2, /* int_reassoc_width. */
> > 4, /* fp_reassoc_width. */
> > - 1, /* fma_reassoc_width. */
> > + 4, /* fma_reassoc_width. */
> > 2, /* vec_reassoc_width. */
> > 2, /* min_div_recip_mul_sf. */
> > 2, /* min_div_recip_mul_df. */
> > 0, /* max_case_values. */
> > tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> > - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
> > + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> > &ere1_prefetch_tune,
> > AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> > AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h
> b/gcc/config/aarch64/tuning_models/ampere1b.h
> > index c4928f50d29..a98b6a980f7 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1b.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h
> > @@ -99,13 +99,14 @@ static const struct tune_params ampere1b_tunings =
> > "32:16", /* loop_align. */
> > 2, /* int_reassoc_width. */
> > 4, /* fp_reassoc_width. */
> > - 1, /* fma_reassoc_width. */
> > + 4, /* fma_reassoc_width. */
> > 2, /* vec_reassoc_width. */
> > 2, /* min_div_recip_mul_sf. */
> > 2, /* min_div_recip_mul_df. */
> > 0, /* max_case_values. */
> > tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
> > - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
> > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
> > + AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
> > &ere1b_prefetch_tune,
> > AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
> > AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
@@ -47,4 +47,6 @@ AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
+AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
+
#undef AARCH64_EXTRA_TUNING_OPTION
@@ -16083,6 +16083,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
&& opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
opts->x_flag_prefetch_loop_arrays = 1;
+ /* Avoid loop-dependant FMA chains. */
+ if (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
+ SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
+ 512);
+
aarch64_override_options_after_change_1 (opts);
}
@@ -104,7 +104,7 @@ static const struct tune_params ampere1_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
&ere1_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
@@ -50,13 +50,13 @@ static const struct tune_params ampere1a_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
+ 4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
&ere1_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */
@@ -99,13 +99,14 @@ static const struct tune_params ampere1b_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
+ 4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
+ AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */
&ere1b_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */
AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */