aarch64: add 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA'

Message ID SN6PR01MB4240670EE838D161E6A3810BE89FA@SN6PR01MB4240.prod.exchangelabs.com
State New
Headers
Series aarch64: add 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA' |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Testing passed

Commit Message

Di Zhao OS Dec. 27, 2023, 10:40 a.m. UTC
  This patch adds a new tuning option, 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA',
so that reassociation takes fully pipelined FMAs into account. It also sets
this option by default for Ampere CPUs.

Tested on aarch64-unknown-linux-gnu. Is this OK for trunk?

Thanks,
Di Zhao

gcc/ChangeLog:

	* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
	New tuning option AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA.
	* config/aarch64/aarch64.cc (aarch64_override_options_internal): Set
	param_fully_pipelined_fma according to tuning option.
	* config/aarch64/tuning_models/ampere1.h: Add
	AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA to tune_flags.
	* config/aarch64/tuning_models/ampere1a.h: Likewise.
	* config/aarch64/tuning_models/ampere1b.h: Likewise.

---
 gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
 gcc/config/aarch64/aarch64.cc               | 6 ++++++
 gcc/config/aarch64/tuning_models/ampere1.h  | 3 ++-
 gcc/config/aarch64/tuning_models/ampere1a.h | 3 ++-
 gcc/config/aarch64/tuning_models/ampere1b.h | 3 ++-
 5 files changed, 14 insertions(+), 3 deletions(-)
  

Comments

Richard Sandiford Dec. 29, 2023, 10:23 a.m. UTC | #1
Di Zhao OS <dizhao@os.amperecomputing.com> writes:
> This patch adds a new tuning option 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA',
> to consider fully pipelined FMAs in reassociation. Also, set this option
> by default for Ampere CPUs.
>
> Tested on aarch64-unknown-linux-gnu. Is this OK for trunk?
>
> Thanks,
> Di Zhao
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
> 	New tuning option AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA.
> 	* config/aarch64/aarch64.cc (aarch64_override_options_internal): Set
> 	param_fully_pipelined_fma according to tuning option.
> 	* config/aarch64/tuning_models/ampere1.h: Add
> 	AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA to tune_flags.
> 	* config/aarch64/tuning_models/ampere1a.h: Likewise.
> 	* config/aarch64/tuning_models/ampere1b.h: Likewise.
>
> ---
>  gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
>  gcc/config/aarch64/aarch64.cc               | 6 ++++++
>  gcc/config/aarch64/tuning_models/ampere1.h  | 3 ++-
>  gcc/config/aarch64/tuning_models/ampere1a.h | 3 ++-
>  gcc/config/aarch64/tuning_models/ampere1b.h | 3 ++-
>  5 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index f28a73839a6..256f17bad60 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -49,4 +49,6 @@ AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGH
>  
>  AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
>  
> +AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_FMA", FULLY_PIPELINED_FMA)

Could you change this to all-lowercase, i.e. fully_pipelined_fma,
for consistency with avoid_cross_loop_fma above?

> +
>  #undef AARCH64_EXTRA_TUNING_OPTION
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f9850320f61..1b3b288cdf9 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -18289,6 +18289,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
>      SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
>  			 512);
>  
> +  /* Consider fully pipelined FMA in reassociation.  */
> +  if (aarch64_tune_params.extra_tuning_flags
> +      & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
> +    SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
> +			 1);
> +
>    aarch64_override_options_after_change_1 (opts);
>  }
>  
> diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
> index a144e8f94b3..d63788528a7 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1.h
> @@ -104,7 +104,8 @@ static const struct tune_params ampere1_tunings =
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */

Formatting nit, but GCC style is to put the "|" at the start of the
following line:

  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA
   | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */

Same for the others.

OK with those changes, thanks.

Richard

>    &ampere1_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
> index f688ed08a79..63506e1d1c6 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1a.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1a.h
> @@ -56,7 +56,8 @@ static const struct tune_params ampere1a_tunings =
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
>    &ampere1_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h
> index a98b6a980f7..7894e730174 100644
> --- a/gcc/config/aarch64/tuning_models/ampere1b.h
> +++ b/gcc/config/aarch64/tuning_models/ampere1b.h
> @@ -106,7 +106,8 @@ static const struct tune_params ampere1b_tunings =
>    0,	/* max_case_values.  */
>    tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
> -   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> +   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
>    &ampere1b_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
  
Di Zhao OS Jan. 3, 2024, 6:53 a.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Friday, December 29, 2023 6:24 PM
> To: Di Zhao OS <dizhao@os.amperecomputing.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] aarch64: add 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA'
> 
> Di Zhao OS <dizhao@os.amperecomputing.com> writes:
> > This patch adds a new tuning option 'AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA',
> > to consider fully pipelined FMAs in reassociation. Also, set this option
> > by default for Ampere CPUs.
> >
> > Tested on aarch64-unknown-linux-gnu. Is this OK for trunk?
> >
> > Thanks,
> > Di Zhao
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
> > 	New tuning option AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA.
> > 	* config/aarch64/aarch64.cc (aarch64_override_options_internal): Set
> > 	param_fully_pipelined_fma according to tuning option.
> > 	* config/aarch64/tuning_models/ampere1.h: Add
> > 	AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA to tune_flags.
> > 	* config/aarch64/tuning_models/ampere1a.h: Likewise.
> > 	* config/aarch64/tuning_models/ampere1b.h: Likewise.
> >
> > ---
> >  gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++
> >  gcc/config/aarch64/aarch64.cc               | 6 ++++++
> >  gcc/config/aarch64/tuning_models/ampere1.h  | 3 ++-
> >  gcc/config/aarch64/tuning_models/ampere1a.h | 3 ++-
> >  gcc/config/aarch64/tuning_models/ampere1b.h | 3 ++-
> >  5 files changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def
> b/gcc/config/aarch64/aarch64-tuning-flags.def
> > index f28a73839a6..256f17bad60 100644
> > --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> > @@ -49,4 +49,6 @@ AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput",
> MATCHED_VECTOR_THROUGH
> >
> >  AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
> >
> > +AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_FMA", FULLY_PIPELINED_FMA)
> 
> Could you change this to all-lowercase, i.e. fully_pipelined_fma,
> for consistency with avoid_cross_loop_fma above?
> 
> > +
> >  #undef AARCH64_EXTRA_TUNING_OPTION
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f9850320f61..1b3b288cdf9 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -18289,6 +18289,12 @@ aarch64_override_options_internal (struct
> gcc_options *opts)
> >      SET_OPTION_IF_UNSET (opts, &global_options_set,
> param_avoid_fma_max_bits,
> >  			 512);
> >
> > +  /* Consider fully pipelined FMA in reassociation.  */
> > +  if (aarch64_tune_params.extra_tuning_flags
> > +      & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
> > +    SET_OPTION_IF_UNSET (opts, &global_options_set,
> param_fully_pipelined_fma,
> > +			 1);
> > +
> >    aarch64_override_options_after_change_1 (opts);
> >  }
> >
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1.h
> b/gcc/config/aarch64/tuning_models/ampere1.h
> > index a144e8f94b3..d63788528a7 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1.h
> > @@ -104,7 +104,8 @@ static const struct tune_params ampere1_tunings =
> >    2,	/* min_div_recip_mul_df.  */
> >    0,	/* max_case_values.  */
> >    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> > -  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> > +  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> > +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
> 
> Formatting nit, but GCC style is to put the "|" at the start of the
> following line:
> 
>   (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA
>    | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
> 
> Same for the others.
> 
> OK with those changes, thanks.

I have fixed both issues and committed the patch to master.

Thanks,
Di

> 
> Richard
> 
> >    &ampere1_prefetch_tune,
> >    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> >    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h
> b/gcc/config/aarch64/tuning_models/ampere1a.h
> > index f688ed08a79..63506e1d1c6 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1a.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1a.h
> > @@ -56,7 +56,8 @@ static const struct tune_params ampere1a_tunings =
> >    2,	/* min_div_recip_mul_df.  */
> >    0,	/* max_case_values.  */
> >    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> > -  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> > +  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> > +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
> >    &ampere1_prefetch_tune,
> >    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> >    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h
> b/gcc/config/aarch64/tuning_models/ampere1b.h
> > index a98b6a980f7..7894e730174 100644
> > --- a/gcc/config/aarch64/tuning_models/ampere1b.h
> > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h
> > @@ -106,7 +106,8 @@ static const struct tune_params ampere1b_tunings =
> >    0,	/* max_case_values.  */
> >    tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
> >    (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
> > -   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
> > +   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
> > +   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
> >    &ampere1b_prefetch_tune,
> >    AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> >    AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
  

Patch

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f28a73839a6..256f17bad60 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -49,4 +49,6 @@  AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGH
 
 AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
 
+AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_FMA", FULLY_PIPELINED_FMA)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f9850320f61..1b3b288cdf9 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18289,6 +18289,12 @@  aarch64_override_options_internal (struct gcc_options *opts)
     SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
 			 512);
 
+  /* Consider fully pipelined FMA in reassociation.  */
+  if (aarch64_tune_params.extra_tuning_flags
+      & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
+    SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
+			 1);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
index a144e8f94b3..d63788528a7 100644
--- a/gcc/config/aarch64/tuning_models/ampere1.h
+++ b/gcc/config/aarch64/tuning_models/ampere1.h
@@ -104,7 +104,8 @@  static const struct tune_params ampere1_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
+   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
   &ampere1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
index f688ed08a79..63506e1d1c6 100644
--- a/gcc/config/aarch64/tuning_models/ampere1a.h
+++ b/gcc/config/aarch64/tuning_models/ampere1a.h
@@ -56,7 +56,8 @@  static const struct tune_params ampere1a_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
+   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
   &ampere1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h
index a98b6a980f7..7894e730174 100644
--- a/gcc/config/aarch64/tuning_models/ampere1b.h
+++ b/gcc/config/aarch64/tuning_models/ampere1b.h
@@ -106,7 +106,8 @@  static const struct tune_params ampere1b_tunings =
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
-   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),	/* tune_flags.  */
+   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA |
+   AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
   &ampere1b_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */