[v2] ira: Add a target hook for callee-saved register cost scale
Checks
| Context |
Check |
Description |
| linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Build passed
|
| linaro-tcwg-bot/tcwg_gcc_check--master-arm |
success
|
Test passed
|
| linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
| linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
scales the cost of saving/restoring a callee-save hard register in epilogue
and prologue with the entry block frequency, which, if not optimizing for
size, is 10000, for all targets. As the result, callee-saved registers
may not be used to preserve local variable values across calls on some
targets, like x86. Add a target hook for the callee-saved register cost
scale in epilogue and prologue used by IRA. The default version of this
target hook returns 1 if optimizing for size, otherwise returns the entry
block frequency. Add an x86 version of this target hook to restore the
old behavior prior to the above commit.
PR rtl-optimization/111673
PR rtl-optimization/115932
PR rtl-optimization/116028
PR rtl-optimization/117081
PR rtl-optimization/117082
PR rtl-optimization/118497
* ira-color.cc (assign_hard_reg): Call the target hook for the
callee-saved register cost scale in epilogue and prologue.
* target.def (ira_callee_saved_register_cost_scale): New target
hook.
* targhooks.cc (default_ira_callee_saved_register_cost_scale):
New.
* targhooks.h (default_ira_callee_saved_register_cost_scale):
Likewise.
* config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
New.
(TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
* doc/tm.texi: Regenerated.
* doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
New.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
gcc/config/i386/i386.cc | 11 +++++++++++
gcc/doc/tm.texi | 8 ++++++++
gcc/doc/tm.texi.in | 2 ++
gcc/ira-color.cc | 3 +--
gcc/target.def | 12 ++++++++++++
gcc/targhooks.cc | 8 ++++++++
gcc/targhooks.h | 1 +
7 files changed, 43 insertions(+), 2 deletions(-)
Comments
On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
>
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
Why use optimize_size when REG_FREQ_FROM_BB uses
((optimize_function_for_size_p (cfun) \
|| !cfun->cfg->count_max.initialized_p ())
? You are changing behavior for all targets this way? Did you consider
altering REG_FREQ_FROM_BB to
#define REG_FREQ_FROM_BB(bb, size_value)
((optimize_function_for_size_p (cfun) \
|| !cfun->cfg->count_max.initialized_p ()) \
? default \
: ((bb)->count.to_frequency (cfun) \
* REG_FREQ_MAX / BB_FREQ_MAX) \
? ((bb)->count.to_frequency (cfun) \
* REG_FREQ_MAX / BB_FREQ_MAX) \
: 1)
so return a specified value for the optimize_size case? (see the
other thread where
I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
IMO at this point a new target hook should preserve existing behavior by default
or alternatively the original patch should be reverted as causing
regressions and
a new patch introducing the target hook should be installed in next stage1.
Richard.
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
> --
> 2.48.1
>
On Mon, Feb 3, 2025 at 5:27 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets. As the result, callee-saved registers
> > may not be used to preserve local variable values across calls on some
> > targets, like x86. Add a target hook for the callee-saved register cost
> > scale in epilogue and prologue used by IRA. The default version of this
> > target hook returns 1 if optimizing for size, otherwise returns the entry
> > block frequency. Add an x86 version of this target hook to restore the
> > old behavior prior to the above commit.
> >
> > PR rtl-optimization/111673
> > PR rtl-optimization/115932
> > PR rtl-optimization/116028
> > PR rtl-optimization/117081
> > PR rtl-optimization/117082
> > PR rtl-optimization/118497
> > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > callee-saved register cost scale in epilogue and prologue.
> > * target.def (ira_callee_saved_register_cost_scale): New target
> > hook.
> > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > New.
> > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > Likewise.
> > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > New.
> > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > * doc/tm.texi: Regenerated.
> > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > New.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > gcc/config/i386/i386.cc | 11 +++++++++++
> > gcc/doc/tm.texi | 8 ++++++++
> > gcc/doc/tm.texi.in | 2 ++
> > gcc/ira-color.cc | 3 +--
> > gcc/target.def | 12 ++++++++++++
> > gcc/targhooks.cc | 8 ++++++++
> > gcc/targhooks.h | 1 +
> > 7 files changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index f89201684a8..3128973ba79 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > return false;
> > }
> >
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > /* Return true if a set of DST by the expression SRC should be allowed.
> > This prevents complex sets of likely_spilled hard regs before split1. */
> >
> > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > + ix86_ira_callee_saved_register_cost_scale
> >
> > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 0de24eda6f0..9f42913a4ef 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > The default version of this target hook always returns given class.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > +A target hook which returns the callee-saved register @var{hard_regno}
> > +cost scale in epilogue and prologue used by IRA.
> > +
> > +The default version of this target hook returns 1 if optimizing for
> > +size, otherwise returns the entry block frequency.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > A target hook which returns true if we use LRA instead of reload pass.
> >
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index 631d04131e3..6dbe22581ca 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -2388,6 +2388,8 @@ in the reload pass.
> >
> > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >
> > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +
> > @hook TARGET_LRA_P
> >
> > @hook TARGET_REGISTER_PRIORITY
> > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > index 0699b349a1a..233060e1587 100644
> > --- a/gcc/ira-color.cc
> > +++ b/gcc/ira-color.cc
> > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > + ira_memory_move_cost[mode][rclass][1])
> > * saved_nregs / hard_regno_nregs (hard_regno,
> > mode) - 1)
> > - * (optimize_size ? 1 :
> > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > cost += add_cost;
> > full_cost += add_cost;
> > }
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 4050b2ebdd4..c348b15815a 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -5714,6 +5714,18 @@ DEFHOOK
> > reg_class_t, (int, reg_class_t, reg_class_t),
> > default_ira_change_pseudo_allocno_class)
> >
> > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > + IRA. */
> > +DEFHOOK
> > +(ira_callee_saved_register_cost_scale,
> > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > +cost scale in epilogue and prologue used by IRA.\n\
> > +\n\
> > +The default version of this target hook returns 1 if optimizing for\n\
> > +size, otherwise returns the entry block frequency.",
> > + int, (int hard_regno),
> > + default_ira_callee_saved_register_cost_scale)
> > +
> > /* Return true if we use LRA instead of reload. */
> > DEFHOOK
> > (lra_p,
> > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > index f80dc8b2e7e..344075efa41 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > return cl;
> > }
> >
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
>
> Why use optimize_size when REG_FREQ_FROM_BB uses
My patch just copies what
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
does.
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ())
>
> ? You are changing behavior for all targets this way? Did you consider
> altering REG_FREQ_FROM_BB to
>
> #define REG_FREQ_FROM_BB(bb, size_value)
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ()) \
> ? default \
> : ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> ? ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> : 1)
>
> so return a specified value for the optimize_size case? (see the
> other thread where
> I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
>
> IMO at this point a new target hook should preserve existing behavior by default
> or alternatively the original patch should be reverted as causing
> regressions and
> a new patch introducing the target hook should be installed in next stage1.
I believe the original patch should be reverted. Then my patch isn't needed.
>
> Richard.
>
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
> > extern bool
> > default_lra_p (void)
> > {
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > index 7d15f632c23..8871e01430c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > reg_class_t);
> > +extern int default_ira_callee_saved_register_cost_scale (int);
> > extern bool default_lra_p (void);
> > extern int default_register_priority (int);
> > extern bool default_register_usage_leveling_p (void);
> > --
> > 2.48.1
> >
On Mon, Feb 3, 2025 at 10:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 3, 2025 at 5:27 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > > Date: Tue Jun 25 08:37:49 2024 -0500
> > >
> > > ira: Scale save/restore costs of callee save registers with block frequency
> > >
> > > scales the cost of saving/restoring a callee-save hard register in epilogue
> > > and prologue with the entry block frequency, which, if not optimizing for
> > > size, is 10000, for all targets. As the result, callee-saved registers
> > > may not be used to preserve local variable values across calls on some
> > > targets, like x86. Add a target hook for the callee-saved register cost
> > > scale in epilogue and prologue used by IRA. The default version of this
> > > target hook returns 1 if optimizing for size, otherwise returns the entry
> > > block frequency. Add an x86 version of this target hook to restore the
> > > old behavior prior to the above commit.
> > >
> > > PR rtl-optimization/111673
> > > PR rtl-optimization/115932
> > > PR rtl-optimization/116028
> > > PR rtl-optimization/117081
> > > PR rtl-optimization/117082
> > > PR rtl-optimization/118497
> > > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > > callee-saved register cost scale in epilogue and prologue.
> > > * target.def (ira_callee_saved_register_cost_scale): New target
> > > hook.
> > > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > > New.
> > > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > > Likewise.
> > > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > > New.
> > > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > > * doc/tm.texi: Regenerated.
> > > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > > New.
> > >
> > > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > > gcc/config/i386/i386.cc | 11 +++++++++++
> > > gcc/doc/tm.texi | 8 ++++++++
> > > gcc/doc/tm.texi.in | 2 ++
> > > gcc/ira-color.cc | 3 +--
> > > gcc/target.def | 12 ++++++++++++
> > > gcc/targhooks.cc | 8 ++++++++
> > > gcc/targhooks.h | 1 +
> > > 7 files changed, 43 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index f89201684a8..3128973ba79 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > > return false;
> > > }
> > >
> > > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > > +
> > > +static int
> > > +ix86_ira_callee_saved_register_cost_scale (int)
> > > +{
> > > + return 1;
> > > +}
> > > +
> > > /* Return true if a set of DST by the expression SRC should be allowed.
> > > This prevents complex sets of likely_spilled hard regs before split1. */
> > >
> > > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > > + ix86_ira_callee_saved_register_cost_scale
> > >
> > > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > > index 0de24eda6f0..9f42913a4ef 100644
> > > --- a/gcc/doc/tm.texi
> > > +++ b/gcc/doc/tm.texi
> > > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > > The default version of this target hook always returns given class.
> > > @end deftypefn
> > >
> > > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > > +A target hook which returns the callee-saved register @var{hard_regno}
> > > +cost scale in epilogue and prologue used by IRA.
> > > +
> > > +The default version of this target hook returns 1 if optimizing for
> > > +size, otherwise returns the entry block frequency.
> > > +@end deftypefn
> > > +
> > > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > > A target hook which returns true if we use LRA instead of reload pass.
> > >
> > > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > > index 631d04131e3..6dbe22581ca 100644
> > > --- a/gcc/doc/tm.texi.in
> > > +++ b/gcc/doc/tm.texi.in
> > > @@ -2388,6 +2388,8 @@ in the reload pass.
> > >
> > > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > >
> > > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > > +
> > > @hook TARGET_LRA_P
> > >
> > > @hook TARGET_REGISTER_PRIORITY
> > > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > > index 0699b349a1a..233060e1587 100644
> > > --- a/gcc/ira-color.cc
> > > +++ b/gcc/ira-color.cc
> > > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > > + ira_memory_move_cost[mode][rclass][1])
> > > * saved_nregs / hard_regno_nregs (hard_regno,
> > > mode) - 1)
> > > - * (optimize_size ? 1 :
> > > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > > cost += add_cost;
> > > full_cost += add_cost;
> > > }
> > > diff --git a/gcc/target.def b/gcc/target.def
> > > index 4050b2ebdd4..c348b15815a 100644
> > > --- a/gcc/target.def
> > > +++ b/gcc/target.def
> > > @@ -5714,6 +5714,18 @@ DEFHOOK
> > > reg_class_t, (int, reg_class_t, reg_class_t),
> > > default_ira_change_pseudo_allocno_class)
> > >
> > > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > > + IRA. */
> > > +DEFHOOK
> > > +(ira_callee_saved_register_cost_scale,
> > > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > > +cost scale in epilogue and prologue used by IRA.\n\
> > > +\n\
> > > +The default version of this target hook returns 1 if optimizing for\n\
> > > +size, otherwise returns the entry block frequency.",
> > > + int, (int hard_regno),
> > > + default_ira_callee_saved_register_cost_scale)
> > > +
> > > /* Return true if we use LRA instead of reload. */
> > > DEFHOOK
> > > (lra_p,
> > > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > > index f80dc8b2e7e..344075efa41 100644
> > > --- a/gcc/targhooks.cc
> > > +++ b/gcc/targhooks.cc
> > > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > > return cl;
> > > }
> > >
> > > +int
> > > +default_ira_callee_saved_register_cost_scale (int)
> > > +{
> > > + return (optimize_size
> > > + ? 1
> >
> > Why use optimize_size when REG_FREQ_FROM_BB uses
>
> My patch just copies what
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> does.
>
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ())
> >
> > ? You are changing behavior for all targets this way? Did you consider
> > altering REG_FREQ_FROM_BB to
> >
> > #define REG_FREQ_FROM_BB(bb, size_value)
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ()) \
> > ? default \
> > : ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > ? ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > : 1)
> >
> > so return a specified value for the optimize_size case? (see the
> > other thread where
> > I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
> >
> > IMO at this point a new target hook should preserve existing behavior by default
> > or alternatively the original patch should be reverted as causing
> > regressions and
> > a new patch introducing the target hook should be installed in next stage1.
>
> I believe the original patch should be reverted. Then my patch isn't needed.
I'm OK with that, but it's not my call. I do wonder why the contributor did not
address any of the fallout. Maybe he's gone? Peter?
Thanks,
Richard.
>
> >
> > Richard.
> >
> > > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > > +}
> > > +
> > > extern bool
> > > default_lra_p (void)
> > > {
> > > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > > index 7d15f632c23..8871e01430c 100644
> > > --- a/gcc/targhooks.h
> > > +++ b/gcc/targhooks.h
> > > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > > reg_class_t);
> > > +extern int default_ira_callee_saved_register_cost_scale (int);
> > > extern bool default_lra_p (void);
> > > extern int default_register_priority (int);
> > > extern bool default_register_usage_leveling_p (void);
> > > --
> > > 2.48.1
> > >
>
>
>
> --
> H.J.
Richard Biener <richard.guenther@gmail.com> writes:
> On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
>> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
>> Date: Tue Jun 25 08:37:49 2024 -0500
>>
>> ira: Scale save/restore costs of callee save registers with block frequency
>>
>> scales the cost of saving/restoring a callee-save hard register in epilogue
>> and prologue with the entry block frequency, which, if not optimizing for
>> size, is 10000, for all targets. As the result, callee-saved registers
>> may not be used to preserve local variable values across calls on some
>> targets, like x86. Add a target hook for the callee-saved register cost
>> scale in epilogue and prologue used by IRA. The default version of this
>> target hook returns 1 if optimizing for size, otherwise returns the entry
>> block frequency. Add an x86 version of this target hook to restore the
>> old behavior prior to the above commit.
>>
>> PR rtl-optimization/111673
>> PR rtl-optimization/115932
>> PR rtl-optimization/116028
>> PR rtl-optimization/117081
>> PR rtl-optimization/117082
>> PR rtl-optimization/118497
>> * ira-color.cc (assign_hard_reg): Call the target hook for the
>> callee-saved register cost scale in epilogue and prologue.
>> * target.def (ira_callee_saved_register_cost_scale): New target
>> hook.
>> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
>> New.
>> * targhooks.h (default_ira_callee_saved_register_cost_scale):
>> Likewise.
>> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
>> New.
>> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
>> * doc/tm.texi: Regenerated.
>> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
>> New.
>>
>> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
>> ---
>> gcc/config/i386/i386.cc | 11 +++++++++++
>> gcc/doc/tm.texi | 8 ++++++++
>> gcc/doc/tm.texi.in | 2 ++
>> gcc/ira-color.cc | 3 +--
>> gcc/target.def | 12 ++++++++++++
>> gcc/targhooks.cc | 8 ++++++++
>> gcc/targhooks.h | 1 +
>> 7 files changed, 43 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
>> index f89201684a8..3128973ba79 100644
>> --- a/gcc/config/i386/i386.cc
>> +++ b/gcc/config/i386/i386.cc
>> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
>> return false;
>> }
>>
>> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
>> +
>> +static int
>> +ix86_ira_callee_saved_register_cost_scale (int)
>> +{
>> + return 1;
>> +}
>> +
>> /* Return true if a set of DST by the expression SRC should be allowed.
>> This prevents complex sets of likely_spilled hard regs before split1. */
>>
>> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
>> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
>> #undef TARGET_CLASS_LIKELY_SPILLED_P
>> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
>> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
>> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
>> + ix86_ira_callee_saved_register_cost_scale
>>
>> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
>> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
>> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
>> index 0de24eda6f0..9f42913a4ef 100644
>> --- a/gcc/doc/tm.texi
>> +++ b/gcc/doc/tm.texi
>> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
>> The default version of this target hook always returns given class.
>> @end deftypefn
>>
>> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
>> +A target hook which returns the callee-saved register @var{hard_regno}
>> +cost scale in epilogue and prologue used by IRA.
>> +
>> +The default version of this target hook returns 1 if optimizing for
>> +size, otherwise returns the entry block frequency.
>> +@end deftypefn
>> +
>> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
>> A target hook which returns true if we use LRA instead of reload pass.
>>
>> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
>> index 631d04131e3..6dbe22581ca 100644
>> --- a/gcc/doc/tm.texi.in
>> +++ b/gcc/doc/tm.texi.in
>> @@ -2388,6 +2388,8 @@ in the reload pass.
>>
>> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>>
>> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
>> +
>> @hook TARGET_LRA_P
>>
>> @hook TARGET_REGISTER_PRIORITY
>> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
>> index 0699b349a1a..233060e1587 100644
>> --- a/gcc/ira-color.cc
>> +++ b/gcc/ira-color.cc
>> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
>> + ira_memory_move_cost[mode][rclass][1])
>> * saved_nregs / hard_regno_nregs (hard_regno,
>> mode) - 1)
>> - * (optimize_size ? 1 :
>> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
>> cost += add_cost;
>> full_cost += add_cost;
>> }
>> diff --git a/gcc/target.def b/gcc/target.def
>> index 4050b2ebdd4..c348b15815a 100644
>> --- a/gcc/target.def
>> +++ b/gcc/target.def
>> @@ -5714,6 +5714,18 @@ DEFHOOK
>> reg_class_t, (int, reg_class_t, reg_class_t),
>> default_ira_change_pseudo_allocno_class)
>>
>> +/* Scale of callee-saved register cost in epilogue and prologue used by
>> + IRA. */
>> +DEFHOOK
>> +(ira_callee_saved_register_cost_scale,
>> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
>> +cost scale in epilogue and prologue used by IRA.\n\
>> +\n\
>> +The default version of this target hook returns 1 if optimizing for\n\
>> +size, otherwise returns the entry block frequency.",
>> + int, (int hard_regno),
>> + default_ira_callee_saved_register_cost_scale)
>> +
>> /* Return true if we use LRA instead of reload. */
>> DEFHOOK
>> (lra_p,
>> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
>> index f80dc8b2e7e..344075efa41 100644
>> --- a/gcc/targhooks.cc
>> +++ b/gcc/targhooks.cc
>> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
>> return cl;
>> }
>>
>> +int
>> +default_ira_callee_saved_register_cost_scale (int)
>> +{
>> + return (optimize_size
>> + ? 1
>
> Why use optimize_size when REG_FREQ_FROM_BB uses
>
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ())
>
> ? You are changing behavior for all targets this way? Did you consider
> altering REG_FREQ_FROM_BB to
>
> #define REG_FREQ_FROM_BB(bb, size_value)
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ()) \
> ? default \
> : ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> ? ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> : 1)
>
> so return a specified value for the optimize_size case? (see the
> other thread where
> I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
>
> IMO at this point a new target hook should preserve existing behavior by default
> or alternatively the original patch should be reverted as causing
> regressions and
> a new patch introducing the target hook should be installed in next stage1.
I don't think we should add a new target hook unless it's providing
genuinely new information about the target. Hooking into the RA to
brute-force a particular heuristic makes it harder to improve the RA
in future.
There are already hooks that provide the costs of the relevant operations,
so I think we should concentrate on using those to get good results for
both Power and x86.
Richard
On Mon, Feb 3, 2025 at 6:29 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> >> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> >> Date: Tue Jun 25 08:37:49 2024 -0500
> >>
> >> ira: Scale save/restore costs of callee save registers with block frequency
> >>
> >> scales the cost of saving/restoring a callee-save hard register in epilogue
> >> and prologue with the entry block frequency, which, if not optimizing for
> >> size, is 10000, for all targets. As the result, callee-saved registers
> >> may not be used to preserve local variable values across calls on some
> >> targets, like x86. Add a target hook for the callee-saved register cost
> >> scale in epilogue and prologue used by IRA. The default version of this
> >> target hook returns 1 if optimizing for size, otherwise returns the entry
> >> block frequency. Add an x86 version of this target hook to restore the
> >> old behavior prior to the above commit.
> >>
> >> PR rtl-optimization/111673
> >> PR rtl-optimization/115932
> >> PR rtl-optimization/116028
> >> PR rtl-optimization/117081
> >> PR rtl-optimization/117082
> >> PR rtl-optimization/118497
> >> * ira-color.cc (assign_hard_reg): Call the target hook for the
> >> callee-saved register cost scale in epilogue and prologue.
> >> * target.def (ira_callee_saved_register_cost_scale): New target
> >> hook.
> >> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> >> New.
> >> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> >> Likewise.
> >> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> >> New.
> >> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> >> * doc/tm.texi: Regenerated.
> >> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> >> New.
> >>
> >> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> >> ---
> >> gcc/config/i386/i386.cc | 11 +++++++++++
> >> gcc/doc/tm.texi | 8 ++++++++
> >> gcc/doc/tm.texi.in | 2 ++
> >> gcc/ira-color.cc | 3 +--
> >> gcc/target.def | 12 ++++++++++++
> >> gcc/targhooks.cc | 8 ++++++++
> >> gcc/targhooks.h | 1 +
> >> 7 files changed, 43 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> >> index f89201684a8..3128973ba79 100644
> >> --- a/gcc/config/i386/i386.cc
> >> +++ b/gcc/config/i386/i386.cc
> >> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> >> return false;
> >> }
> >>
> >> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> >> +
> >> +static int
> >> +ix86_ira_callee_saved_register_cost_scale (int)
> >> +{
> >> + return 1;
> >> +}
> >> +
> >> /* Return true if a set of DST by the expression SRC should be allowed.
> >> This prevents complex sets of likely_spilled hard regs before split1. */
> >>
> >> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> >> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> >> #undef TARGET_CLASS_LIKELY_SPILLED_P
> >> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> >> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> >> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> >> + ix86_ira_callee_saved_register_cost_scale
> >>
> >> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> >> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> >> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> >> index 0de24eda6f0..9f42913a4ef 100644
> >> --- a/gcc/doc/tm.texi
> >> +++ b/gcc/doc/tm.texi
> >> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> >> The default version of this target hook always returns given class.
> >> @end deftypefn
> >>
> >> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> >> +A target hook which returns the callee-saved register @var{hard_regno}
> >> +cost scale in epilogue and prologue used by IRA.
> >> +
> >> +The default version of this target hook returns 1 if optimizing for
> >> +size, otherwise returns the entry block frequency.
> >> +@end deftypefn
> >> +
> >> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> >> A target hook which returns true if we use LRA instead of reload pass.
> >>
> >> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> >> index 631d04131e3..6dbe22581ca 100644
> >> --- a/gcc/doc/tm.texi.in
> >> +++ b/gcc/doc/tm.texi.in
> >> @@ -2388,6 +2388,8 @@ in the reload pass.
> >>
> >> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >>
> >> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> >> +
> >> @hook TARGET_LRA_P
> >>
> >> @hook TARGET_REGISTER_PRIORITY
> >> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> >> index 0699b349a1a..233060e1587 100644
> >> --- a/gcc/ira-color.cc
> >> +++ b/gcc/ira-color.cc
> >> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> >> + ira_memory_move_cost[mode][rclass][1])
> >> * saved_nregs / hard_regno_nregs (hard_regno,
> >> mode) - 1)
> >> - * (optimize_size ? 1 :
> >> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> >> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> >> cost += add_cost;
> >> full_cost += add_cost;
> >> }
> >> diff --git a/gcc/target.def b/gcc/target.def
> >> index 4050b2ebdd4..c348b15815a 100644
> >> --- a/gcc/target.def
> >> +++ b/gcc/target.def
> >> @@ -5714,6 +5714,18 @@ DEFHOOK
> >> reg_class_t, (int, reg_class_t, reg_class_t),
> >> default_ira_change_pseudo_allocno_class)
> >>
> >> +/* Scale of callee-saved register cost in epilogue and prologue used by
> >> + IRA. */
> >> +DEFHOOK
> >> +(ira_callee_saved_register_cost_scale,
> >> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> >> +cost scale in epilogue and prologue used by IRA.\n\
> >> +\n\
> >> +The default version of this target hook returns 1 if optimizing for\n\
> >> +size, otherwise returns the entry block frequency.",
> >> + int, (int hard_regno),
> >> + default_ira_callee_saved_register_cost_scale)
> >> +
> >> /* Return true if we use LRA instead of reload. */
> >> DEFHOOK
> >> (lra_p,
> >> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> >> index f80dc8b2e7e..344075efa41 100644
> >> --- a/gcc/targhooks.cc
> >> +++ b/gcc/targhooks.cc
> >> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> >> return cl;
> >> }
> >>
> >> +int
> >> +default_ira_callee_saved_register_cost_scale (int)
> >> +{
> >> + return (optimize_size
> >> + ? 1
> >
> > Why use optimize_size when REG_FREQ_FROM_BB uses
> >
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ())
> >
> > ? You are changing behavior for all targets this way? Did you consider
> > altering REG_FREQ_FROM_BB to
> >
> > #define REG_FREQ_FROM_BB(bb, size_value)
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ()) \
> > ? default \
> > : ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > ? ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > : 1)
> >
> > so return a specified value for the optimize_size case? (see the
> > other thread where
> > I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
> >
> > IMO at this point a new target hook should preserve existing behavior by default
> > or alternatively the original patch should be reverted as causing
> > regressions and
> > a new patch introducing the target hook should be installed in next stage1.
>
> I don't think we should add a new target hook unless it's providing
> genuinely new information about the target. Hooking into the RA to
> brute-force a particular heuristic makes it harder to improve the RA
> in future.
>
> There are already hooks that provide the costs of the relevant operations,
> so I think we should concentrate on using those to get good results for
> both Power and x86.
It isn't just about Power and x86.
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
caused regressions on many targets, including aarch64:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116028
I don't understand why frequency can be used to scale the
cost.
On 2/3/25 2:31 AM, H.J. Lu wrote:
>>
>> IMO at this point a new target hook should preserve existing behavior by default
>> or alternatively the original patch should be reverted as causing
>> regressions and
>> a new patch introducing the target hook should be installed in next stage1.
>
> I believe the original patch should be reverted. Then my patch isn't needed.
That patch had significant improvements across the board for RISC-V. I
wouldn't want to see it reverted without a strong explanation of why it
was wrong.
jeff
> > I don't think we should add a new target hook unless it's providing
> > genuinely new information about the target. Hooking into the RA to
> > brute-force a particular heuristic makes it harder to improve the RA
> > in future.
> >
> > There are already hooks that provide the costs of the relevant operations,
> > so I think we should concentrate on using those to get good results for
> > both Power and x86.
>
> It isn't just about Power and x86.
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> caused regressions on many targets, including aarch64:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116028
>
> I don't understand why frequency can be used to scale the
> cost.
Adding such absolute hook does not seem to make sense to me, expecially
if the problem happens on multiple targets.
Let me see if I understnad what is going on. Checking one PR the patch
broke testcase:
void foo (void);
void bar (void);
int
test (int a)
{
int r;
if (r = -a)
foo ();
else
bar ();
return r;
}
Here the cost model should decide whether extra spill in
prologe/epilogue is chaper than extra spill around calls to foo() and
bar().
REG_FREQ_FROM_BB should return values in range 1....REG_FREQ_MAX which
expresses relative frequencies of BBs compressed to 0...REG_FREQ_MAX.
Avoiding 0 is to prevent cost model from thinking that spilling is
completely free.
In the case above we should have entry_block_freq = REG_FREQ_MAX (so the
epilogue/prologue cost is scaled by 1000) and frequencies of foo/bar to
have something like REG_FREQ_MAX * p and REG_FREQ_MAX * (1-p) where p is
the probability of the conditional.
So I guess IRA ends up comparing
- spill_cost * REG_FREQ_MAX for using callee saved register
- spill_cost * (p + 1-p) * REG_FREQ_MAX for using caller saved register.
Which correctly represnets that in both case we will end up executing
same amount of spill code.
If we want it to choose variant with fewer static spill count, we could
add 1 to the final costs for every spill instruction which will biass
the cost model that way...
What would make sense would be to express to IRA that moves used in
prologue/epilogue are slightly chaper on x86 then reuglar moves, since we can use
push/pop pair which encode shorter than stack frame adjustment + regular
store/load used to spill around call.
Honza
>
> --
> H.J.
Hello,
On Mon, 3 Feb 2025, H.J. Lu wrote:
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets.
This merely represents the fact that the entry block is indeed entered
exactly once per function invocation, i.e. 1.0 in fixed point with a scale
of 1000. All costs in ira are (supposed to be) scaled by bb-frequency of
the allocno/register occurence, and hence this add_cost to cater for
xlogue-save/restore needs to be scaled by that as well, which is what
Suryas patch was adding.
Any fallout from that needs to be addressed on top of that, not by
reverting it, or by introducing a hook to avoid that. Think of this scale
as an arbitrary value to implement pseudo-fixed-point arithmetic for
costs. All values need to be scaled by it. That its value is a seemingly
large number of 1000 is not the worry, it represents 1.0 .
If the issue is for instance that callee-saved registers aren't used
because the prologue save/restore is now deemed too expensive relative to
the around-call-save-restore when a call-clobbered register is used, then
either the around-call-save-restore instructions aren't correctly costed
(perhaps also missing the scale factor?), or because ties aren't broken
nicely, in which case adding a 1 at one or the other place might be
needed.
Ciao,
Michael.
On 2/3/25 3:44 AM, Richard Biener wrote:
> On Mon, Feb 3, 2025 at 10:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>> I believe the original patch should be reverted. Then my patch isn't needed.
>
> I'm OK with that, but it's not my call. I do wonder why the contributor did not
> address any of the fallout. Maybe he's gone? Peter?
Surya (she) is working on the fallout. In fact, one patch earlier this year
was committed and reverted due to some aarch64 fallout. That said, Andrew
mentioned on IRC that he was interested in getting that patch back in for aarch64
because it helps shrink-wrapping and he believes the patch itself wasn't bad,
but exposed a latent issue that was causing the bootstrap issue on aarch64.
Surya also just recently submitted another patch to help with the original fallout:
[PATCH] lra: initialize allocated_hard_reg_p[] for hard regs referenced in RTL [PR118533]
...which you commented on. She is working on them.
I disagree with H.J.'s comment. I have said before at the Cauldron and in
some bugzilla's, that Surya's fix is a correct fix. The issues encountered
here seem to be latent issues exposed by Surya's fix (read also Matz's reply)
and as such, this patch should stay. The correct path here is to track down
those latent issues and fix those. I've asked Surya to continue to work on
the fallout, but any help from other's is greatly appreciated!
Reverting now would also cause performance regressions on Power, RISC-V and ARM.
Peter
On 2/3/25 7:14 AM, Jeff Law wrote:
> On 2/3/25 2:31 AM, H.J. Lu wrote:
>> I believe the original patch should be reverted. Then my patch isn't needed.
>
> That patch had significant improvements across the board for RISC-V.
> I wouldn't want to see it reverted without a strong explanation of why it was wrong.
In my opinion, the patch is not wrong, but rather has exposed latent issues
that need to be worked on and fixed. Ive asked Surya to continue working on
the fallout (see her other patches), but help from others is always appreciated.
Peter
> Hello,
>
> On Mon, 3 Feb 2025, H.J. Lu wrote:
>
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets.
>
> This merely represents the fact that the entry block is indeed entered
> exactly once per function invocation, i.e. 1.0 in fixed point with a scale
> of 1000. All costs in ira are (supposed to be) scaled by bb-frequency of
> the allocno/register occurence, and hence this add_cost to cater for
> xlogue-save/restore needs to be scaled by that as well, which is what
> Suryas patch was adding.
This is nice summary. One correction is that REG_FREQ_MAX should
represent maximal frequency in function (i.e. innermost loop if it
exists). So entry bock has REG_FREQ_MAX only in functions with no BBs
that execute with frequency >1.
Freq 1 is frequency of entry block (which is correctly used by the
patch).
>
> Any fallout from that needs to be addressed on top of that, not by
> reverting it, or by introducing a hook to avoid that. Think of this scale
> as an arbitrary value to implement pseudo-fixed-point arithmetic for
> costs. All values need to be scaled by it. That its value is a seemingly
> large number of 1000 is not the worry, it represents 1.0 .
>
> If the issue is for instance that callee-saved registers aren't used
> because the prologue save/restore is now deemed too expensive relative to
> the around-call-save-restore when a call-clobbered register is used, then
> either the around-call-save-restore instructions aren't correctly costed
> (perhaps also missing the scale factor?), or because ties aren't broken
> nicely, in which case adding a 1 at one or the other place might be
> needed.
In the testcase I quoted from one of PRs the costs was simply
(dynamically) the same. there were two function calls with combine
frequency 1. We may want to add logic representing that static
instruction counts matters too, if the dynamic costs are the same which
indeed can be done by adding 1.
Other thing is that push/pop is shorter than mov which is target
specific knowledge.
Honza
>
>
> Ciao,
> Michael.
On 2/3/25 1:20 AM, H.J. Lu wrote:
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
This is a complicated problem resulted in many tries to fix it in some
general way.
In general I am agree with Surya's approach to scale cost of reg
saves/restores somehow. But the general approach, although solved some
problems, also created a lot of new ones. May be because IRA does not
take some other aspects of using callee saved regs. And some of them
were addressed by other patches. e.g. recently proposed by Surya and one
for PR118497.
I also agree with Richard Sandiford's comment that we should avoid
introducing the new hooks for RA and I actually tried to stick to this
policy for a long time. But I don't see another solution to introducing
the new hook in this case. It is hard to figure out generally in RA
that saves/restores will be insns different from ld/st (e.g. x86
push/pop) and that they will be cheaper.
So after some time to think about the patch I decided to approve the RA
part of the patch. I also hope that the work on this problem will
continue (e.g. improving default and target hook implementations and
documentation how to better use it).
We have a lot of tests checking expected code generation (some of them
are wrong, e.g. expecting particular hard regs). But those are small
tests. We should pay more attention to check impact on SPEC for such
patches. It is more time-consuming but it is the right way. A patch can
introduce worse code generation for small test but can be a win for real
applications as ones in SPEC. Any RA in industrial compiler can not
generate optimal code for most cases. So I would not mark such GCC test
failures as P1 PRs.
The RA patch is OK for me for committing it into the trunk. Thank you.
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
Vladimir Makarov <vmakarov@redhat.com> writes:
> On 2/3/25 1:20 AM, H.J. Lu wrote:
>> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
>> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
>> Date: Tue Jun 25 08:37:49 2024 -0500
>>
>> ira: Scale save/restore costs of callee save registers with block frequency
>>
>> scales the cost of saving/restoring a callee-save hard register in epilogue
>> and prologue with the entry block frequency, which, if not optimizing for
>> size, is 10000, for all targets. As the result, callee-saved registers
>> may not be used to preserve local variable values across calls on some
>> targets, like x86. Add a target hook for the callee-saved register cost
>> scale in epilogue and prologue used by IRA. The default version of this
>> target hook returns 1 if optimizing for size, otherwise returns the entry
>> block frequency. Add an x86 version of this target hook to restore the
>> old behavior prior to the above commit.
>
> This is a complicated problem resulted in many tries to fix it in some
> general way.
>
> In general I am agree with Surya's approach to scale cost of reg
> saves/restores somehow. But the general approach, although solved some
> problems, also created a lot of new ones. May be because IRA does not
> take some other aspects of using callee saved regs. And some of them
> were addressed by other patches. e.g. recently proposed by Surya and one
> for PR118497.
>
> I also agree with Richard Sandiford's comment that we should avoid
> introducing the new hooks for RA and I actually tried to stick to this
> policy for a long time. But I don't see another solution to introducing
> the new hook in this case. It is hard to figure out generally in RA
> that saves/restores will be insns different from ld/st (e.g. x86
> push/pop) and that they will be cheaper.
>
> So after some time to think about the patch I decided to approve the RA
> part of the patch. I also hope that the work on this problem will
> continue (e.g. improving default and target hook implementations and
> documentation how to better use it).
For the record, I strongly object to this. The hook just seems like a
complete hack to me. Even if we accept that there is target-specific
information in play, the hook isn't providing that information.
In contrast to what you said above, my objection isn't to having hooks
-- those are often needed and good. What bothers me is that the hook
isn't well designed. Hooks should provide information rather than
override code by brute force.
On the other hand, I accept that you're (rightly!) the maintainer.
Thanks,
Richard
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > return cl;
> > }
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
I am not sure how this makes sense - why x86 would be significantly
different from other targets?
I think the only bit non-standard thing is that prologue/epilogue code
can use push/pop that is shorter than mov used to save caller saved
registers.
I went through few testcases:
void d();
void a()
{
int b;
asm ("use %0":"=r" (b));
d();
asm volatile (""::"r" (b));
}
compiler with -O2 -fira-verbose=10000 gives:
Popping a0(r99,l0) -- (0=12000,12000) (1=12000,12000) (2=12000,12000) (4=12000,12000) (5=12000,12000) (36=12000,12000) (37=12000,12000) (38=12000,12000) (39=12000,12000) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
load and save costs are 6. So spill pair is 12 weighted by 1000 that is
REG_FREQ_MAX.
Register 0 (EAX) has cost 12000 which makes sense to me:
- load and save costs are 6, combined spill pair is 12
- REG_FREQ_MAX is 1000 and since function has only one BB, it has
maximal frequency, so we get 12000.
Register 3 (first caller saved) has cost 11000. This comes from:
add_cost = ((ira_memory_move_cost[mode][rclass][0]
+ ira_memory_move_cost[mode][rclass][1])
* saved_nregs / hard_regno_nregs (hard_regno,
mode) - 1)
^^
here
* (optimize_size ? 1 :
REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
There is no comment why -1, but I suppose it is there to biass costs to
use prologue/epilogue instad of caller save sequence when runtime cost
estimate is even.
Now for
void d();
void a()
{
for (int i = 0; i < 100; i++)
d();
int b;
asm ("use %0":"=r" (b));
d();
asm volatile (""::"r" (b));
}
I get
Popping a0(r100,l0) -- (0=120,120) (1=120,120) (2=120,120) (4=120,120) (5=120,120) (36=120,120) (37=120,120) (38=120,120) (39=120,120) (3=0,0) (6=110,110) (40=110,110) (41=110,110) (42=110,110) (43=110,110)
This also makes sense to me, since there is loop the basic block has
lower frequency of 10, thus costs are scaled down.
void d();
int cnd;
void a()
{
int b;
asm ("use %0":"=r" (b));
if (__builtin_expect_with_probability (cnd, 1, 0.8))
d();
asm volatile (""::"r" (b));
}
I get
Popping a0(r100,l0) -- (0=9600,9600) (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
which seems also correct. It is better to use caller saved registr
since call to d() has lower frequency then the entry basic block. This
is what gcc 13 and this patch gets wrong
Popping a0(r100,l0) -- (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11,11) (6=11,11) (40=11,11) (41=11,11) (42=11,11) (43=11,11)
Due to missing scaling factor we think that using callee saved registr
is win while it is not. GCC13 gets this wrong even for probability 0.
Looking into PRs referneced in the patch:
PR111673 is the original bug that motivated correcting the cost (adding
the scale by entry block frequency)
PR115932 is cris-elf I don't know how to bencmark easily.
PR116028 seems to be about shrink wrapping in
void f(int *i)
{
if (!i)
return;
else
{
__builtin_printf("Hi");
*i=0;
}
}
here I see tha tthe cost model misses the fact that epilogue will be
shrink-wrapped so both caller and callee saving will result in one spill
after the early exit.
PR117081 is about regression in povray. The reducted testcase:
void foo (void);
void bar (void);
int
test (int a)
{
int r;
if (r = -a)
foo ();
else
bar ();
return r;
}
shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
saved register and produces:
0000000000000000 <test>:
0: 53 push %rbx <--- callee save
1: 89 fb mov %edi,%ebx <--- move 1
3: f7 db neg %ebx
5: 74 09 je 10 <test+0x10>
7: e8 00 00 00 00 call c <test+0xc>
c: 89 d8 mov %ebx,%eax <--- callee restore
e: 5b pop %rbx
f: c3 ret
10: e8 00 00 00 00 call 15 <test+0x15>
15: 89 d8 mov %ebx,%eax <--- move 2
17: 5b pop %rbx <--- callee restore
18: c3 ret
Mainline used EAX since it has costs 13000. It is not 100% clear to me
why.
- 12000 is the spilling (which is emitted twice but executed just once)
- I would have expected 2000 for the move from edi to eax.
However even if cost is 14000 we will choose EAX. The code is:
0: 89 f8 mov %edi,%eax <--- move1
2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
6: f7 d8 neg %eax
8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
c: 85 ff test %edi,%edi
e: 74 10 je 20 <test+0x20>
10: e8 00 00 00 00 call 15 <test+0x15>
15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
1d: c3 ret
1e: 66 90 xchg %ax,%ax
20: e8 00 00 00 00 call 25 <test+0x25>
25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
2d: c3 ret
This sequence really saves one move at the expense of of stack frame
allocation (which is not modelled by the cost model) and longer spill
code (also no modelled).
PR117082 is about noreturn function:
__attribute__ ((noreturn))
void
f3 (void)
{
int y0 = x0;
int y1 = x1;
f1 ();
f2 (y0, y1);
while (1);
}
Here the cost model is really wrong by assuming that entry and exit
block have same frequencies. This can be fixed quite easilly (though it
is a rare case)
PR118497 seems to be ixed.
So overall I think
1) we can fix scaling of epilogue by exit block frequency
to get noreturns right.
2) we should drop the check for optimize_size. Since with -Os
REG_FREQ_FROM_BB always returns 1000 everything should be scaled
same way
3) we currently have wire in "-1" to biass the cost metric for callee
saved registers.
It may make sense to allow targets to control this, since i.e. x86
has push/pop that is shorter. -3 would solve the testcase with neg
and would express that push/pop is still cheaper with extra reg-reg
move.
4) cost model misses shring wrapping, the fact that if register is
callee saved it may be used by multiple allocnos and also that
push/pop sequence may avoid need for manual RSP adjustments.
Those seems bit harder things to fit in though.
So if we want to go with the target hook, I think it should adjust the
cost before scalling (since targets may have special tricks for
prologues) rather than the scale factor (which is target independent
part of cost model).
Honza
On 2/6/25 4:54 PM, Richard Sandiford wrote:
> Vladimir Makarov<vmakarov@redhat.com> writes:
>> This is a complicated problem resulted in many tries to fix it in some
>> general way.
>>
>> In general I am agree with Surya's approach to scale cost of reg
>> saves/restores somehow. But the general approach, although solved some
>> problems, also created a lot of new ones. May be because IRA does not
>> take some other aspects of using callee saved regs. And some of them
>> were addressed by other patches. e.g. recently proposed by Surya and one
>> for PR118497.
>>
>> I also agree with Richard Sandiford's comment that we should avoid
>> introducing the new hooks for RA and I actually tried to stick to this
>> policy for a long time. But I don't see another solution to introducing
>> the new hook in this case. It is hard to figure out generally in RA
>> that saves/restores will be insns different from ld/st (e.g. x86
>> push/pop) and that they will be cheaper.
>>
>> So after some time to think about the patch I decided to approve the RA
>> part of the patch. I also hope that the work on this problem will
>> continue (e.g. improving default and target hook implementations and
>> documentation how to better use it).
> For the record, I strongly object to this. The hook just seems like a
> complete hack to me. Even if we accept that there is target-specific
> information in play, the hook isn't providing that information.
>
> In contrast to what you said above, my objection isn't to having hooks
> -- those are often needed and good. What bothers me is that the hook
> isn't well designed. Hooks should provide information rather than
> override code by brute force.
>
> On the other hand, I accept that you're (rightly!) the maintainer.
>
I also don't like the hook implementation for x86-64 (although this is a
matter of target maintainers). All these costs look voodoo and random
to me.
But this problem is longing for more than half year. I spent a lot of
time too on this. Patches were submitted and reverted and nobody did
find so far any solution satisfying all GCC tests. If somebody finds a
solution without the hook, I will be glad to get rid off the hook. Also
the related PRs are marked as P1 ones, it means people think they are
important (I am not sure about this myself). Without fixing them (or
downgrading them) there will be no GCC release. So I am in difficult
situation with these PRs and need some resolution.
On Thu, Feb 6, 2025 at 11:40 PM Vladimir Makarov <vmakarov@redhat.com> wrote:
>
>
> On 2/6/25 4:54 PM, Richard Sandiford wrote:
>
> Vladimir Makarov <vmakarov@redhat.com> writes:
>
> This is a complicated problem resulted in many tries to fix it in some
> general way.
>
> In general I am agree with Surya's approach to scale cost of reg
> saves/restores somehow. But the general approach, although solved some
> problems, also created a lot of new ones. May be because IRA does not
> take some other aspects of using callee saved regs. And some of them
> were addressed by other patches. e.g. recently proposed by Surya and one
> for PR118497.
>
> I also agree with Richard Sandiford's comment that we should avoid
> introducing the new hooks for RA and I actually tried to stick to this
> policy for a long time. But I don't see another solution to introducing
> the new hook in this case. It is hard to figure out generally in RA
> that saves/restores will be insns different from ld/st (e.g. x86
> push/pop) and that they will be cheaper.
>
> So after some time to think about the patch I decided to approve the RA
> part of the patch. I also hope that the work on this problem will
> continue (e.g. improving default and target hook implementations and
> documentation how to better use it).
>
> For the record, I strongly object to this. The hook just seems like a
> complete hack to me. Even if we accept that there is target-specific
> information in play, the hook isn't providing that information.
>
> In contrast to what you said above, my objection isn't to having hooks
> -- those are often needed and good. What bothers me is that the hook
> isn't well designed. Hooks should provide information rather than
> override code by brute force.
>
> On the other hand, I accept that you're (rightly!) the maintainer.
>
> I also don't like the hook implementation for x86-64 (although this is a matter of target maintainers). All these costs look voodoo and random to me.
>
> But this problem is longing for more than half year. I spent a lot of time too on this. Patches were submitted and reverted and nobody did find so far any solution satisfying all GCC tests. If somebody finds a solution without the hook, I will be glad to get rid off the hook. Also the related PRs are marked as P1 ones, it means people think they are important (I am not sure about this myself). Without fixing them (or downgrading them) there will be no GCC release. So I am in difficult situation with these PRs and need some resolution.
Just to chime in as the one who likely made those PRs P1. 'P1' here
is really about the testsuite FAIL,
how to resolve it is up to target maintainers - P1 should make them
look, and adjusting the testcase
(or even XFAILing it) is a valid resolution of the P1 regression.
I do agree with Richards comment on the proposed hook to be badly
designed and I do like
Honzas suggestion to instead hookize the biasing.
Richard.
>
>
> On Thu, Feb 6, 2025 at 11:40 PM Vladimir Makarov <vmakarov@redhat.com> wrote:
> >
> >
> > On 2/6/25 4:54 PM, Richard Sandiford wrote:
> >
> > Vladimir Makarov <vmakarov@redhat.com> writes:
> >
> > This is a complicated problem resulted in many tries to fix it in some
> > general way.
> >
> > In general I am agree with Surya's approach to scale cost of reg
> > saves/restores somehow. But the general approach, although solved some
> > problems, also created a lot of new ones. May be because IRA does not
> > take some other aspects of using callee saved regs. And some of them
> > were addressed by other patches. e.g. recently proposed by Surya and one
> > for PR118497.
> >
> > I also agree with Richard Sandiford's comment that we should avoid
> > introducing the new hooks for RA and I actually tried to stick to this
> > policy for a long time. But I don't see another solution to introducing
> > the new hook in this case. It is hard to figure out generally in RA
> > that saves/restores will be insns different from ld/st (e.g. x86
> > push/pop) and that they will be cheaper.
> >
> > So after some time to think about the patch I decided to approve the RA
> > part of the patch. I also hope that the work on this problem will
> > continue (e.g. improving default and target hook implementations and
> > documentation how to better use it).
> >
> > For the record, I strongly object to this. The hook just seems like a
> > complete hack to me. Even if we accept that there is target-specific
> > information in play, the hook isn't providing that information.
> >
> > In contrast to what you said above, my objection isn't to having hooks
> > -- those are often needed and good. What bothers me is that the hook
> > isn't well designed. Hooks should provide information rather than
> > override code by brute force.
> >
> > On the other hand, I accept that you're (rightly!) the maintainer.
> >
> > I also don't like the hook implementation for x86-64 (although this is a matter of target maintainers). All these costs look voodoo and random to me.
> >
> > But this problem is longing for more than half year. I spent a lot of time too on this. Patches were submitted and reverted and nobody did find so far any solution satisfying all GCC tests. If somebody finds a solution without the hook, I will be glad to get rid off the hook. Also the related PRs are marked as P1 ones, it means people think they are important (I am not sure about this myself). Without fixing them (or downgrading them) there will be no GCC release. So I am in difficult situation with these PRs and need some resolution.
>
> Just to chime in as the one who likely made those PRs P1. 'P1' here
> is really about the testsuite FAIL,
> how to resolve it is up to target maintainers - P1 should make them
> look, and adjusting the testcase
> (or even XFAILing it) is a valid resolution of the P1 regression.
I certainly understand the pain with heuirstics changes distubring
testsuite and bringing some regressions and some improvements making it
difficult to weight overall impact. With inliner heuristics this
happens often.
We are good on tracking regressions, but if something improves, it goes
without saying. If some heuristics is formely wrong and fixing it
introduces some problems, we are kind of biassed towards keeping old,
broken one. Consistency is good, but I think in this specific case we
should see overall improvements.
The testcases where we previously shrink-wrapped and now use caller
saved registers are IMO not necessarily a problem. We had bug in cost
model that made us to use callee saved registers too often and
shrink-wrapping helped to mitigate it. Using caller saved register
has same effect if frequency of caller saving sums to 1. So perhaps
adjusting the testcases to have more calls will restore original
behaviour in cases where they are intendd to verify that shrink-wrapping
works.
Naturally it would be nice to make the cost model anticipate the
shrink-wrapping but that looks like quite hard problem to me, since
precise conditions are known only post-reload. Perhaps one can special
case some early exit conditions where shrink-wrapping is most important.
I will prepare patch for noreturns. They are probably not too important
in practice but easy to get right.
Honza
Really nice analysis! Thanks for writing this up.
Sorry for the big quote below, but:
Jan Hubicka <hubicka@ucw.cz> writes:
>> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
>> > +
>> > +static int
>> > +ix86_ira_callee_saved_register_cost_scale (int)
>> > +{
>> > + return 1;
>> > +}
>> > +
>
>> > return cl;
>> > }
>> > +int
>> > +default_ira_callee_saved_register_cost_scale (int)
>> > +{
>> > + return (optimize_size
>> > + ? 1
>> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>> > +}
>> > +
>
> I am not sure how this makes sense - why x86 would be significantly
> different from other targets?
>
> I think the only bit non-standard thing is that prologue/epilogue code
> can use push/pop that is shorter than mov used to save caller saved
> registers.
>
> I went through few testcases:
>
> void d();
> void a()
> {
> int b;
> asm ("use %0":"=r" (b));
> d();
> asm volatile (""::"r" (b));
> }
> compiler with -O2 -fira-verbose=10000 gives:
>
> Popping a0(r99,l0) -- (0=12000,12000) (1=12000,12000) (2=12000,12000) (4=12000,12000) (5=12000,12000) (36=12000,12000) (37=12000,12000) (38=12000,12000) (39=12000,12000) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
>
> load and save costs are 6. So spill pair is 12 weighted by 1000 that is
> REG_FREQ_MAX.
>
> Register 0 (EAX) has cost 12000 which makes sense to me:
> - load and save costs are 6, combined spill pair is 12
> - REG_FREQ_MAX is 1000 and since function has only one BB, it has
> maximal frequency, so we get 12000.
>
> Register 3 (first caller saved) has cost 11000. This comes from:
> add_cost = ((ira_memory_move_cost[mode][rclass][0]
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> ^^
> here
>
> * (optimize_size ? 1 :
> REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>
> There is no comment why -1, but I suppose it is there to biass costs to
> use prologue/epilogue instad of caller save sequence when runtime cost
> estimate is even.
>
> Now for
> void d();
> void a()
> {
> for (int i = 0; i < 100; i++)
> d();
> int b;
> asm ("use %0":"=r" (b));
> d();
> asm volatile (""::"r" (b));
> }
>
> I get
>
> Popping a0(r100,l0) -- (0=120,120) (1=120,120) (2=120,120) (4=120,120) (5=120,120) (36=120,120) (37=120,120) (38=120,120) (39=120,120) (3=0,0) (6=110,110) (40=110,110) (41=110,110) (42=110,110) (43=110,110)
>
> This also makes sense to me, since there is loop the basic block has
> lower frequency of 10, thus costs are scaled down.
>
> void d();
> int cnd;
> void a()
> {
> int b;
> asm ("use %0":"=r" (b));
>
> if (__builtin_expect_with_probability (cnd, 1, 0.8))
> d();
> asm volatile (""::"r" (b));
> }
>
> I get
>
> Popping a0(r100,l0) -- (0=9600,9600) (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
>
> which seems also correct. It is better to use caller saved registr
> since call to d() has lower frequency then the entry basic block. This
> is what gcc 13 and this patch gets wrong
>
> Popping a0(r100,l0) -- (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11,11) (6=11,11) (40=11,11) (41=11,11) (42=11,11) (43=11,11)
>
> Due to missing scaling factor we think that using callee saved registr
> is win while it is not. GCC13 gets this wrong even for probability 0.
>
> Looking into PRs referneced in the patch:
> PR111673 is the original bug that motivated correcting the cost (adding
> the scale by entry block frequency)
> PR115932 is cris-elf I don't know how to bencmark easily.
> PR116028 seems to be about shrink wrapping in
>
> void f(int *i)
> {
> if (!i)
> return;
> else
> {
> __builtin_printf("Hi");
> *i=0;
> }
> }
>
> here I see tha tthe cost model misses the fact that epilogue will be
> shrink-wrapped so both caller and callee saving will result in one spill
> after the early exit.
>
> PR117081 is about regression in povray. The reducted testcase:
>
> void foo (void);
> void bar (void);
>
> int
> test (int a)
> {
> int r;
>
> if (r = -a)
> foo ();
> else
> bar ();
>
> return r;
> }
>
> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
>
> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
> saved register and produces:
>
> 0000000000000000 <test>:
> 0: 53 push %rbx <--- callee save
> 1: 89 fb mov %edi,%ebx <--- move 1
> 3: f7 db neg %ebx
> 5: 74 09 je 10 <test+0x10>
> 7: e8 00 00 00 00 call c <test+0xc>
> c: 89 d8 mov %ebx,%eax <--- callee restore
> e: 5b pop %rbx
> f: c3 ret
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 89 d8 mov %ebx,%eax <--- move 2
> 17: 5b pop %rbx <--- callee restore
> 18: c3 ret
>
> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> why.
> - 12000 is the spilling (which is emitted twice but executed just once)
> - I would have expected 2000 for the move from edi to eax.
> However even if cost is 14000 we will choose EAX. The code is:
>
> 0: 89 f8 mov %edi,%eax <--- move1
> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> 6: f7 d8 neg %eax
> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> c: 85 ff test %edi,%edi
> e: 74 10 je 20 <test+0x20>
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> 1d: c3 ret
> 1e: 66 90 xchg %ax,%ax
> 20: e8 00 00 00 00 call 25 <test+0x25>
> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> 2d: c3 ret
>
> This sequence really saves one move at the expense of of stack frame
> allocation (which is not modelled by the cost model) and longer spill
> code (also no modelled).
Kind of a tangent, but is a cost of 2 reasonable for these reg<-reg moves,
or is it too high? Reducing moves seems like the wrong thing to optimise
for (when optimising for speed) if the moves disappear during renaming anyway.
> PR117082 is about noreturn function:
> __attribute__ ((noreturn))
> void
> f3 (void)
> {
> int y0 = x0;
> int y1 = x1;
> f1 ();
> f2 (y0, y1);
> while (1);
> }
>
> Here the cost model is really wrong by assuming that entry and exit
> block have same frequencies. This can be fixed quite easilly (though it
> is a rare case)
Yeah (and nice example).
> PR118497 seems to be ixed.
>
> So overall I think
> 1) we can fix scaling of epilogue by exit block frequency
> to get noreturns right.
> 2) we should drop the check for optimize_size. Since with -Os
> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> same way
> 3) we currently have wire in "-1" to biass the cost metric for callee
> saved registers.
> It may make sense to allow targets to control this, since i.e. x86
> has push/pop that is shorter. -3 would solve the testcase with neg
> and would express that push/pop is still cheaper with extra reg-reg
> move.
> 4) cost model misses shring wrapping, the fact that if register is
> callee saved it may be used by multiple allocnos and also that
> push/pop sequence may avoid need for manual RSP adjustments.
>
> Those seems bit harder things to fit in though.
>
> So if we want to go with the target hook, I think it should adjust the
> cost before scalling (since targets may have special tricks for
> prologues) rather than the scale factor (which is target independent
> part of cost model).
Like you say, one of the missing pieces appears to be the allocation/
dealloaction overhead for callee saves. Could we try to add a hook to
model that cost, based on certain parameters?
In particular, one thing that the examples above have in common is that
they don't need to allocate a frame for local variables. That seems
like it ought to be part of the mix. If we need to allocate a frame
using addition anyway, then presumably one of the advantages of push/pop
over callee saves goes away.
But going back to my question above, modelling the allocation and
deallocation would need to be done in a way that makes them more
expensive than moves. I think in some cases we've assumed that
addition and moves have equal cost, even when optimising for speed.
In other words, rather than add a hook to tweak the -1 bias (which I
still agree is a more reasonable thing to do than bypassing the IRA
code altogether), could we add a separate hook for the allocation/
deallocation and leave IRA to work out when to apply it?
I suppose for -fno-omit-frame-pointer we should also take the creation
of the frame link into account, if we don't already. This matters on
aarch64 since -fomit-frame-pointer is not the default at -O2.
One quirk on aarch64 is that, if we're using an odd number of
callee-saved GPRs, adding one more is essentially free, since we would
allocate space for it anyway, and could save and restore it alongside
the odd one out. That would probably be difficult to apply in practice
though. And it's obviously a separate issue from the current one.
Just mentioning it as another thing we could model in future.
Thanks,
Richard
Richard Sandiford <richard.sandiford@arm.com> writes:
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
Gah, of course I meant caller saves.
> >
> > 0: 89 f8 mov %edi,%eax <--- move1
> > 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> > 6: f7 d8 neg %eax
> > 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> > c: 85 ff test %edi,%edi
> > e: 74 10 je 20 <test+0x20>
> > 10: e8 00 00 00 00 call 15 <test+0x15>
> > 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> > 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> > 1d: c3 ret
> > 1e: 66 90 xchg %ax,%ax
> > 20: e8 00 00 00 00 call 25 <test+0x25>
> > 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> > 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> > 2d: c3 ret
> >
> > This sequence really saves one move at the expense of of stack frame
> > allocation (which is not modelled by the cost model) and longer spill
> > code (also no modelled).
>
> Kind of a tangent, but is a cost of 2 reasonable for these reg<-reg moves,
> or is it too high? Reducing moves seems like the wrong thing to optimise
> for (when optimising for speed) if the moves disappear during renaming anyway.
The costs are scaled so 2 is cost of usual reg-reg move. This
definitely predates me, but it is how reload worked. It used cost 2 for
move and 1 was used to add various biasses (such as ! in constraints).
It used scale of 3^depth to avoid spilling in deeper loop nests and I
have introduced the frequency metric instead.
Loat/store costs are thus relative to reg-reg move cost divided by two.
For generic we set cost of load/store to be 6, which is usual latency of
the operation (3 cycles). On modern x86 architectures this is all
relative, since reg-reg move has good chance to have latency of 1
and store+load pairs may execute faster, too.
>
> Like you say, one of the missing pieces appears to be the allocation/
> dealloaction overhead for callee saves. Could we try to add a hook to
> model that cost, based on certain parameters?
>
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
Even if you need to allocate frame, push/pop has advantage of short
encoding. But indeed it would be interesting to model the fact that
introducing frame has some cost.
On modern x86 the cost of stack adjustments is relatively low, since
they has stack engines.
>
> But going back to my question above, modelling the allocation and
> deallocation would need to be done in a way that makes them more
> expensive than moves. I think in some cases we've assumed that
> addition and moves have equal cost, even when optimising for speed.
Execution costs of stack adjustment are probalby not much bigger then
those of reg-reg move. so cost of 2 or 3 would make sense to me
>
> In other words, rather than add a hook to tweak the -1 bias (which I
> still agree is a more reasonable thing to do than bypassing the IRA
> code altogether), could we add a separate hook for the allocation/
> deallocation and leave IRA to work out when to apply it?
>
> I suppose for -fno-omit-frame-pointer we should also take the creation
> of the frame link into account, if we don't already. This matters on
> aarch64 since -fomit-frame-pointer is not the default at -O2.
>
> One quirk on aarch64 is that, if we're using an odd number of
> callee-saved GPRs, adding one more is essentially free, since we would
> allocate space for it anyway, and could save and restore it alongside
> the odd one out. That would probably be difficult to apply in practice
> though. And it's obviously a separate issue from the current one.
> Just mentioning it as another thing we could model in future.
Looks like fun, indeed.
Honza
>
> Thanks,
> Richard
Richard Sandiford <richard.sandiford@arm.com> writes:
> Really nice analysis! Thanks for writing this up.
>
> Sorry for the big quote below, but:
>
> Jan Hubicka <hubicka@ucw.cz> writes:
>> [...]
>> PR117081 is about regression in povray. The reducted testcase:
>>
>> void foo (void);
>> void bar (void);
>>
>> int
>> test (int a)
>> {
>> int r;
>>
>> if (r = -a)
>> foo ();
>> else
>> bar ();
>>
>> return r;
>> }
>>
>> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
>> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
>>
>> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
>> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
>> saved register and produces:
>>
>> 0000000000000000 <test>:
>> 0: 53 push %rbx <--- callee save
>> 1: 89 fb mov %edi,%ebx <--- move 1
>> 3: f7 db neg %ebx
>> 5: 74 09 je 10 <test+0x10>
>> 7: e8 00 00 00 00 call c <test+0xc>
>> c: 89 d8 mov %ebx,%eax <--- callee restore
>> e: 5b pop %rbx
>> f: c3 ret
>> 10: e8 00 00 00 00 call 15 <test+0x15>
>> 15: 89 d8 mov %ebx,%eax <--- move 2
>> 17: 5b pop %rbx <--- callee restore
>> 18: c3 ret
>>
>> Mainline used EAX since it has costs 13000. It is not 100% clear to me
>> why.
>> - 12000 is the spilling (which is emitted twice but executed just once)
>> - I would have expected 2000 for the move from edi to eax.
>> However even if cost is 14000 we will choose EAX. The code is:
>>
>> 0: 89 f8 mov %edi,%eax <--- move1
>> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
>> 6: f7 d8 neg %eax
>> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
>> c: 85 ff test %edi,%edi
>> e: 74 10 je 20 <test+0x20>
>> 10: e8 00 00 00 00 call 15 <test+0x15>
>> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
>> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
>> 1d: c3 ret
>> 1e: 66 90 xchg %ax,%ax
>> 20: e8 00 00 00 00 call 25 <test+0x25>
>> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
>> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
>> 2d: c3 ret
>>
>> This sequence really saves one move at the expense of of stack frame
>> allocation (which is not modelled by the cost model) and longer spill
>> code (also no modelled).
>> [...]
>> PR117082 is about noreturn function:
>> __attribute__ ((noreturn))
>> void
>> f3 (void)
>> {
>> int y0 = x0;
>> int y1 = x1;
>> f1 ();
>> f2 (y0, y1);
>> while (1);
>> }
>>
>> Here the cost model is really wrong by assuming that entry and exit
>> block have same frequencies. This can be fixed quite easilly (though it
>> is a rare case)
>
> Yeah (and nice example).
>
>> PR118497 seems to be ixed.
>>
>> So overall I think
>> 1) we can fix scaling of epilogue by exit block frequency
>> to get noreturns right.
>> 2) we should drop the check for optimize_size. Since with -Os
>> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
>> same way
>> 3) we currently have wire in "-1" to biass the cost metric for callee
>> saved registers.
>> It may make sense to allow targets to control this, since i.e. x86
>> has push/pop that is shorter. -3 would solve the testcase with neg
>> and would express that push/pop is still cheaper with extra reg-reg
>> move.
>> 4) cost model misses shring wrapping, the fact that if register is
>> callee saved it may be used by multiple allocnos and also that
>> push/pop sequence may avoid need for manual RSP adjustments.
>>
>> Those seems bit harder things to fit in though.
>>
>> So if we want to go with the target hook, I think it should adjust the
>> cost before scalling (since targets may have special tricks for
>> prologues) rather than the scale factor (which is target independent
>> part of cost model).
>
> Like you say, one of the missing pieces appears to be the allocation/
> dealloaction overhead for callee saves. Could we try to add a hook to
> model that cost, based on certain parameters?
>
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
>
> But going back to my question above, modelling the allocation and
> deallocation would need to be done in a way that makes them more
> expensive than moves. I think in some cases we've assumed that
> addition and moves have equal cost, even when optimising for speed.
>
> In other words, rather than add a hook to tweak the -1 bias (which I
> still agree is a more reasonable thing to do than bypassing the IRA
> code altogether), could we add a separate hook for the allocation/
> deallocation and leave IRA to work out when to apply it?
>
> I suppose for -fno-omit-frame-pointer we should also take the creation
> of the frame link into account, if we don't already. This matters on
> aarch64 since -fomit-frame-pointer is not the default at -O2.
>
> One quirk on aarch64 is that, if we're using an odd number of
> callee-saved GPRs, adding one more is essentially free, since we would
> allocate space for it anyway, and could save and restore it alongside
> the odd one out. That would probably be difficult to apply in practice
> though. And it's obviously a separate issue from the current one.
> Just mentioning it as another thing we could model in future.
FWIW, here's a very rough initial version of the kind of thing
I was thinking about. Hopefully the hook documentation describes
the approach. It's deliberately (overly?) flexible.
I've included an aarch64 version that (a) models the fact that the
first caller-save can also allocate the frame more-or-less for free,
and (b) once we've saved an odd number of GPRs, saving one more is
essentialy free. I also hacked up an x86 version locally to model
the allocation benefits of using caller-saved registers. It seemed
to fix the povray example above.
This still needs a lot of clean-up and testing, but I thought I might
as well send what I have before leaving for the weekend. Does it look
reasonable in principle?
Thanks,
Richard
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c1e40200806..3f9453a74fb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15870,6 +15870,60 @@ aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
: base + aarch64_tune_params.memmov_cost.store_int);
}
+/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
+ RA has already decided to use. Return the total number of GPRs
+ that need to be saved and restored, including the frame link registers. */
+static int
+aarch64_count_gpr_saves (const HARD_REG_SET &callee_saved_regs)
+{
+ auto saved_gprs = callee_saved_regs & reg_class_contents[GENERAL_REGS];
+ /* FIXME: popcount hack. */
+ auto nregs = popcount_hwi (saved_gprs.elts[0]);
+
+ if (aarch64_needs_frame_chain ())
+ nregs += 2;
+ else if (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM))
+ nregs += 1;
+ return nregs;
+}
+
+/* Implement TARGET_CALLEE_SAVE_COST. */
+static int
+aarch64_callee_save_cost (spill_cost_type, unsigned int regno,
+ machine_mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spill_p)
+{
+ auto ngprs = aarch64_count_gpr_saves (callee_saved_regs);
+
+ /* FIXME: handle FP and Advanced SIMD registers. */
+
+ /* If we've already committed to saving an odd number of GPRs, assume that
+ saving one more will involve turning an STR into an STP. */
+ if (GP_REGNUM_P (regno) && nregs == 1 && (ngprs & 1))
+ return 0;
+
+ /* If this would be the first register that we save, add the cost of
+ allocating or deallocating the frame. For GPR saves, the allocation
+ and deallocation can be folded into the save and restore. */
+ if (!existing_spill_p
+ && ngprs == 0
+ && !GP_REGNUM_P (regno))
+ mem_cost += 2;
+
+ return mem_cost;
+}
+
+/* Implement TARGET_FRAME_ALLOCATION_COST. */
+static int
+aarch64_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &callee_saved_regs)
+{
+ if (aarch64_count_gpr_saves (callee_saved_regs) == 0)
+ return 2;
+ return 0;
+}
+
/* Implement TARGET_INSN_COST. We have the opportunity to do something
much more productive here, such as using insn attributes to cost things.
But we don't, not yet.
@@ -31572,6 +31626,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST aarch64_callee_save_cost
+
+#undef TARGET_FRAME_ALLOCATION_COST
+#define TARGET_FRAME_ALLOCATION_COST aarch64_frame_allocation_cost
+
#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 0de24eda6f0..171bb8c0b77 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7003,6 +7003,74 @@ value to the result of that function. The arguments to that function
are the same as to this target hook.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_CALLEE_SAVE_COST (spill_cost_type @var{cost_type}, unsigned int @var{hard_regno}, machine_mode @var{mode}, unsigned int @var{nregs}, int @var{mem_cost}, const HARD_REG_SET @var{&allocated_callee_regs}, bool @var{existing_spills_p})
+Return the one-off cost of saving or restoring callee-saved registers
+(also known as call-preserved register or non-volatile registers).
+The parameters are as follows:
+
+@itemize
+@item
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register
+and @samp{spill_cost_type::RESTORE} for restoring a register.
+
+@item
+@var{hard_regno} and @var{mode} represent the whole register that
+the register allocator is considering using; of these,
+@var{nregs} registers are fully or partially callee-saved.
+
+@item
+@var{mem_cost} is the normal cost for storing (for saves)
+or loading (for restores) the @var{nregs} registers.
+
+@item
+@var{allocated_callee_regs} is the set of callee-saved registers
+that are already in use.
+
+@item
+@var{existing_spills_p} is true if the register allocator has
+already decided to spill registers to memory.
+@end itemize
+
+If @var{existing_spills_p} is false, the cost of a save should account
+for frame allocations in a way that is consistent with
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for
+caller-saved registers. Similarly, the cost of a restore should then
+account for frame deallocations in a way that is consistent with
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.
+
+Note that this hook should not attempt to apply a frequency scale
+to the cost: it is the caller's responsibility to do that where
+appropriate.
+
+The default implementation returns @var{mem_cost}, plus the allocation
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},
+where appropriate.
+@end deftypefn
+
+@deftypefn {Target Hook} int TARGET_FRAME_ALLOCATION_COST (frame_cost_type @var{cost_type}, const HARD_REG_SET @var{&allocated_callee_regs})
+Return the cost of allocating or deallocating a frame for the sake of
+using caller-saved registers; @var{cost_type} chooses between allocation
+and deallocation.
+
+This hook is only called if the register allocator has not so far
+decided to spill any pseudo registers to memory or to use caller-saved
+registers for values that are live across a call. It may have
+decided to use callee-saved registers; if so, @var{allocated_callee_regs}
+is the set of callee-saved registers it has used. There might also be
+other reasons why a stack frame is already needed; for example,
+@samp{get_frame_size ()} may be nonzero, or the target might already
+require a frame for target-specific reasons.
+
+When the register allocator uses this hook to cost caller-saved registers,
+it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved
+registers, passing @samp{false} as the @var{existing_spills_p} argument.
+The intention is to allow the target to apply an apples-for-apples
+comparison between callee-saved and caller-saved registers when the
+allocator has not yet committed to using registers of both types.
+
+The default implementation returns 0.
+@end deftypefn
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 631d04131e3..eccc4d88493 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4582,6 +4582,10 @@ These macros are obsolete, new ports should use the target hook
@hook TARGET_MEMORY_MOVE_COST
+@hook TARGET_CALLEE_SAVE_COST
+
+@hook TARGET_FRAME_ALLOCATION_COST
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 0699b349a1a..9aaac794f35 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -1199,6 +1199,9 @@ finish_update_cost_records (void)
register was already allocated for an allocno. */
static bool allocated_hardreg_p[FIRST_PSEUDO_REGISTER];
+/* Which callee-saved hard registers we've decided to save. */
+static HARD_REG_SET allocated_callee_save_regs;
+
/* Describes one element in a queue of allocnos whose costs need to be
updated. Each allocno in the queue is known to have an allocno
class. */
@@ -1740,6 +1743,20 @@ check_hard_reg_p (ira_allocno_t a, int hard_regno,
return j == nregs;
}
+/* Record that we have allocated NREGS registers starting at HARD_REGNO. */
+
+static void
+record_allocation (int hard_regno, int nregs)
+{
+ for (int i = 0; i < nregs; ++i)
+ if (!allocated_hardreg_p[hard_regno + i])
+ {
+ allocated_hardreg_p[hard_regno + i] = true;
+ if (!crtl->abi->clobbers_full_reg_p (hard_regno + i))
+ SET_HARD_REG_BIT (allocated_callee_save_regs, hard_regno + i);
+ }
+}
+
/* Return number of registers needed to be saved and restored at
function prologue/epilogue if we allocate HARD_REGNO to hold value
of MODE. */
@@ -1961,6 +1978,14 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
#endif
auto_bitmap allocnos_to_spill;
HARD_REG_SET soft_conflict_regs = {};
+ int entry_freq = REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ int exit_freq = REG_FREQ_FROM_BB (EXIT_BLOCK_PTR_FOR_FN (cfun));
+ /* Whether we have spilled pseudos or used caller-saved registers
+ for values that are live across a call.
+
+ FIXME: also check for pseudos that prefer NO_REGS and not equivalences.
+ Also account for regular spills. */
+ bool existing_spills_p = caller_save_needed;
ira_assert (! ALLOCNO_ASSIGNED_P (a));
get_conflict_and_start_profitable_regs (a, retry_p,
@@ -2175,17 +2200,45 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
/* We need to save/restore the hard register in
epilogue/prologue. Therefore we increase the cost. */
{
+ int nregs = hard_regno_nregs (hard_regno, mode);
+ add_cost = 0;
rclass = REGNO_REG_CLASS (hard_regno);
- add_cost = ((ira_memory_move_cost[mode][rclass][0]
- + ira_memory_move_cost[mode][rclass][1])
- * saved_nregs / hard_regno_nregs (hard_regno,
- mode) - 1)
- * (optimize_size ? 1 :
- REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+
+ auto entry_cost = targetm.callee_save_cost
+ (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ add_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.callee_save_cost
+ (spill_cost_type::RESTORE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][1] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ add_cost += exit_cost * exit_freq;
+
+ /* In the event of a tie between caller-save and callee-save,
+ prefer callee-save. */
+ add_cost -= 1;
+
cost += add_cost;
full_cost += add_cost;
}
}
+ if (!existing_spills_p && ira_need_caller_save_p (a, hard_regno))
+ {
+ add_cost = 0;
+
+ auto entry_cost = targetm.frame_allocation_cost
+ (frame_cost_type::ALLOCATION, allocated_callee_save_regs);
+ add_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.frame_allocation_cost
+ (frame_cost_type::DEALLOCATION, allocated_callee_save_regs);
+ add_cost += exit_cost * exit_freq;
+
+ cost += add_cost;
+ full_cost += add_cost;
+ }
if (min_cost > cost)
min_cost = cost;
if (min_full_cost > full_cost)
@@ -2212,8 +2265,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
fail:
if (best_hard_regno >= 0)
{
- for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--)
- allocated_hardreg_p[best_hard_regno + i] = true;
+ record_allocation (best_hard_regno,
+ hard_regno_nregs (best_hard_regno, mode));
spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs,
best_hard_regno);
}
@@ -3369,8 +3422,7 @@ improve_allocation (void)
/* Assign the best chosen hard register to A. */
ALLOCNO_HARD_REGNO (a) = best;
- for (j = nregs - 1; j >= 0; j--)
- allocated_hardreg_p[best + j] = true;
+ record_allocation (best, nregs);
if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
diff --git a/gcc/target.def b/gcc/target.def
index 4050b2ebdd4..fdd80c06c77 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3775,6 +3775,80 @@ are the same as to this target hook.",
int, (machine_mode mode, reg_class_t rclass, bool in),
default_memory_move_cost)
+DEFHOOK
+(callee_save_cost,
+ "Return the one-off cost of saving or restoring callee-saved registers\n\
+(also known as call-preserved register or non-volatile registers).\n\
+The parameters are as follows:\n\
+\n\
+@itemize\n\
+@item\n\
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register\n\
+and @samp{spill_cost_type::RESTORE} for restoring a register.\n\
+\n\
+@item\n\
+@var{hard_regno} and @var{mode} represent the whole register that\n\
+the register allocator is considering using; of these,\n\
+@var{nregs} registers are fully or partially callee-saved.\n\
+\n\
+@item\n\
+@var{mem_cost} is the normal cost for storing (for saves)\n\
+or loading (for restores) the @var{nregs} registers.\n\
+\n\
+@item\n\
+@var{allocated_callee_regs} is the set of callee-saved registers\n\
+that are already in use.\n\
+\n\
+@item\n\
+@var{existing_spills_p} is true if the register allocator has\n\
+already decided to spill registers to memory.\n\
+@end itemize\n\
+\n\
+If @var{existing_spills_p} is false, the cost of a save should account\n\
+for frame allocations in a way that is consistent with\n\
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for\n\
+caller-saved registers. Similarly, the cost of a restore should then\n\
+account for frame deallocations in a way that is consistent with\n\
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.\n\
+\n\
+Note that this hook should not attempt to apply a frequency scale\n\
+to the cost: it is the caller's responsibility to do that where\n\
+appropriate.\n\
+\n\
+The default implementation returns @var{mem_cost}, plus the allocation\n\
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},\n\
+where appropriate.",
+ int, (spill_cost_type cost_type, unsigned int hard_regno,
+ machine_mode mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &allocated_callee_regs, bool existing_spills_p),
+ default_callee_save_cost)
+
+DEFHOOK
+(frame_allocation_cost,
+ "Return the cost of allocating or deallocating a frame for the sake of\n\
+using caller-saved registers; @var{cost_type} chooses between allocation\n\
+and deallocation.\n\
+\n\
+This hook is only called if the register allocator has not so far\n\
+decided to spill any pseudo registers to memory or to use caller-saved\n\
+registers for values that are live across a call. It may have\n\
+decided to use callee-saved registers; if so, @var{allocated_callee_regs}\n\
+is the set of callee-saved registers it has used. There might also be\n\
+other reasons why a stack frame is already needed; for example,\n\
+@samp{get_frame_size ()} may be nonzero, or the target might already\n\
+require a frame for target-specific reasons.\n\
+\n\
+When the register allocator uses this hook to cost caller-saved registers,\n\
+it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved\n\
+registers, passing @samp{false} as the @var{existing_spills_p} argument.\n\
+The intention is to allow the target to apply an apples-for-apples\n\
+comparison between callee-saved and caller-saved registers when the\n\
+allocator has not yet committed to using registers of both types.\n\
+\n\
+The default implementation returns 0.",
+ int, (frame_cost_type cost_type, const HARD_REG_SET &allocated_callee_regs),
+ default_frame_allocation_cost)
+
DEFHOOK
(use_by_pieces_infrastructure_p,
"GCC will attempt several strategies when asked to copy between\n\
diff --git a/gcc/target.h b/gcc/target.h
index 3e1ee68a341..2bf35e2d0ee 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -284,6 +284,18 @@ enum poly_value_estimate_kind
POLY_VALUE_LIKELY
};
+enum class spill_cost_type
+{
+ SAVE,
+ RESTORE
+};
+
+enum class frame_cost_type
+{
+ ALLOCATION,
+ DEALLOCATION
+};
+
typedef void (*emit_support_tinfos_callback) (tree);
extern bool verify_type_context (location_t, type_context_kind, const_tree,
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index f80dc8b2e7e..a8331ba5576 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2075,6 +2075,29 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
#endif
}
+int
+default_callee_save_cost (spill_cost_type spill_type, unsigned int,
+ machine_mode, unsigned int, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spills_p)
+{
+ if (!existing_spills_p)
+ {
+ auto frame_type = (spill_type == spill_cost_type::SAVE
+ ? frame_cost_type::ALLOCATION
+ : frame_cost_type::DEALLOCATION);
+ mem_cost += targetm.frame_allocation_cost (frame_type,
+ callee_saved_regs);
+ }
+ return mem_cost;
+}
+
+int
+default_frame_allocation_cost (frame_cost_type, const HARD_REG_SET &)
+{
+ return 0;
+}
+
/* The default implementation of TARGET_SLOW_UNALIGNED_ACCESS. */
bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 7d15f632c23..0c4c27e257e 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -234,6 +234,11 @@ extern tree default_builtin_tm_load_store (tree);
extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
extern int default_register_move_cost (machine_mode, reg_class_t,
reg_class_t);
+extern int default_callee_save_cost (spill_cost_type, unsigned int,
+ machine_mode, unsigned int, int,
+ const HARD_REG_SET &, bool);
+extern int default_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &);
extern bool default_slow_unaligned_access (machine_mode, unsigned int);
extern HOST_WIDE_INT default_estimated_poly_value (poly_int64,
poly_value_estimate_kind);
On 2/6/25 5:35 PM, Jan Hubicka wrote:
>
> Register 3 (first caller saved) has cost 11000. This comes from:
> add_cost = ((ira_memory_move_cost[mode][rclass][0]
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> ^^
> here
> * (optimize_size ? 1 :
> REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>
> There is no comment why -1, but I suppose it is there to biass costs to
> use prologue/epilogue instad of caller save sequence when runtime cost
> estimate is even.
It is a very old code. As I remember RA w/o this avoided to use a
callee-saved reg at all in cases when it could be used for more one
pseudo. The idea was also that some targets typical savings/restores
are cheaper (pop/push, multiple reg lds/sts) and it should be taken into
account somehow.
>
>
> 0000000000000000 <test>:
> 0: 53 push %rbx <--- callee save
> 1: 89 fb mov %edi,%ebx <--- move 1
> 3: f7 db neg %ebx
> 5: 74 09 je 10 <test+0x10>
> 7: e8 00 00 00 00 call c <test+0xc>
> c: 89 d8 mov %ebx,%eax <--- callee restore
> e: 5b pop %rbx
> f: c3 ret
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 89 d8 mov %ebx,%eax <--- move 2
> 17: 5b pop %rbx <--- callee restore
> 18: c3 ret
>
> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> why.
In many cases it is hard to find why this particular cost occurs as the
costs are updated dynamically and assignment a pseudo to hard reg can
affect cost for another pseudo which is not involved in a move with the
first pseudo (but involved through a chain of moves). Those are
complicated heuristics changed several times and verified by visible
SPEC performance improvements.
>
> So overall I think
> 1) we can fix scaling of epilogue by exit block frequency
> to get noreturns right.
> 2) we should drop the check for optimize_size. Since with -Os
> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> same way
> 3) we currently have wire in "-1" to biass the cost metric for callee
> saved registers.
> It may make sense to allow targets to control this, since i.e. x86
> has push/pop that is shorter. -3 would solve the testcase with neg
> and would express that push/pop is still cheaper with extra reg-reg
> move.
> 4) cost model misses shring wrapping, the fact that if register is
> callee saved it may be used by multiple allocnos and also that
> push/pop sequence may avoid need for manual RSP adjustments.
Shrink wrapping was later addition to RA and I guess nobody thought how
to update cost model taking it into account.
> Those seems bit harder things to fit in though.
>
> So if we want to go with the target hook, I think it should adjust the
> cost before scalling (since targets may have special tricks for
> prologues) rather than the scale factor (which is target independent
> part of cost model).
Very nice analysis, Honza. I believe we still need a hook and I'll work
on the target hook improvement.
On Fri, Feb 7, 2025 at 9:20 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > Really nice analysis! Thanks for writing this up.
> >
> > Sorry for the big quote below, but:
> >
> > Jan Hubicka <hubicka@ucw.cz> writes:
> >> [...]
> >> PR117081 is about regression in povray. The reducted testcase:
> >>
> >> void foo (void);
> >> void bar (void);
> >>
> >> int
> >> test (int a)
> >> {
> >> int r;
> >>
> >> if (r = -a)
> >> foo ();
> >> else
> >> bar ();
> >>
> >> return r;
> >> }
> >>
> >> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
> >> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
> >>
> >> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
> >> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
> >> saved register and produces:
> >>
> >> 0000000000000000 <test>:
> >> 0: 53 push %rbx <--- callee save
> >> 1: 89 fb mov %edi,%ebx <--- move 1
> >> 3: f7 db neg %ebx
> >> 5: 74 09 je 10 <test+0x10>
> >> 7: e8 00 00 00 00 call c <test+0xc>
> >> c: 89 d8 mov %ebx,%eax <--- callee restore
> >> e: 5b pop %rbx
> >> f: c3 ret
> >> 10: e8 00 00 00 00 call 15 <test+0x15>
> >> 15: 89 d8 mov %ebx,%eax <--- move 2
> >> 17: 5b pop %rbx <--- callee restore
> >> 18: c3 ret
> >>
> >> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> >> why.
> >> - 12000 is the spilling (which is emitted twice but executed just once)
> >> - I would have expected 2000 for the move from edi to eax.
> >> However even if cost is 14000 we will choose EAX. The code is:
> >>
> >> 0: 89 f8 mov %edi,%eax <--- move1
> >> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> >> 6: f7 d8 neg %eax
> >> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> >> c: 85 ff test %edi,%edi
> >> e: 74 10 je 20 <test+0x20>
> >> 10: e8 00 00 00 00 call 15 <test+0x15>
> >> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> >> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> >> 1d: c3 ret
> >> 1e: 66 90 xchg %ax,%ax
> >> 20: e8 00 00 00 00 call 25 <test+0x25>
> >> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> >> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> >> 2d: c3 ret
> >>
> >> This sequence really saves one move at the expense of of stack frame
> >> allocation (which is not modelled by the cost model) and longer spill
> >> code (also no modelled).
> >> [...]
> >> PR117082 is about noreturn function:
> >> __attribute__ ((noreturn))
> >> void
> >> f3 (void)
> >> {
> >> int y0 = x0;
> >> int y1 = x1;
> >> f1 ();
> >> f2 (y0, y1);
> >> while (1);
> >> }
> >>
> >> Here the cost model is really wrong by assuming that entry and exit
> >> block have same frequencies. This can be fixed quite easilly (though it
> >> is a rare case)
> >
> > Yeah (and nice example).
> >
> >> PR118497 seems to be ixed.
> >>
> >> So overall I think
> >> 1) we can fix scaling of epilogue by exit block frequency
> >> to get noreturns right.
> >> 2) we should drop the check for optimize_size. Since with -Os
> >> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> >> same way
> >> 3) we currently have wire in "-1" to biass the cost metric for callee
> >> saved registers.
> >> It may make sense to allow targets to control this, since i.e. x86
> >> has push/pop that is shorter. -3 would solve the testcase with neg
> >> and would express that push/pop is still cheaper with extra reg-reg
> >> move.
> >> 4) cost model misses shring wrapping, the fact that if register is
> >> callee saved it may be used by multiple allocnos and also that
> >> push/pop sequence may avoid need for manual RSP adjustments.
> >>
> >> Those seems bit harder things to fit in though.
> >>
> >> So if we want to go with the target hook, I think it should adjust the
> >> cost before scalling (since targets may have special tricks for
> >> prologues) rather than the scale factor (which is target independent
> >> part of cost model).
> >
> > Like you say, one of the missing pieces appears to be the allocation/
> > dealloaction overhead for callee saves. Could we try to add a hook to
> > model that cost, based on certain parameters?
> >
> > In particular, one thing that the examples above have in common is that
> > they don't need to allocate a frame for local variables. That seems
> > like it ought to be part of the mix. If we need to allocate a frame
> > using addition anyway, then presumably one of the advantages of push/pop
> > over callee saves goes away.
> >
> > But going back to my question above, modelling the allocation and
> > deallocation would need to be done in a way that makes them more
> > expensive than moves. I think in some cases we've assumed that
> > addition and moves have equal cost, even when optimising for speed.
> >
> > In other words, rather than add a hook to tweak the -1 bias (which I
> > still agree is a more reasonable thing to do than bypassing the IRA
> > code altogether), could we add a separate hook for the allocation/
> > deallocation and leave IRA to work out when to apply it?
> >
> > I suppose for -fno-omit-frame-pointer we should also take the creation
> > of the frame link into account, if we don't already. This matters on
> > aarch64 since -fomit-frame-pointer is not the default at -O2.
> >
> > One quirk on aarch64 is that, if we're using an odd number of
> > callee-saved GPRs, adding one more is essentially free, since we would
> > allocate space for it anyway, and could save and restore it alongside
> > the odd one out. That would probably be difficult to apply in practice
> > though. And it's obviously a separate issue from the current one.
> > Just mentioning it as another thing we could model in future.
>
> FWIW, here's a very rough initial version of the kind of thing
> I was thinking about. Hopefully the hook documentation describes
> the approach. It's deliberately (overly?) flexible.
>
> I've included an aarch64 version that (a) models the fact that the
> first caller-save can also allocate the frame more-or-less for free,
> and (b) once we've saved an odd number of GPRs, saving one more is
> essentialy free. I also hacked up an x86 version locally to model
> the allocation benefits of using caller-saved registers. It seemed
> to fix the povray example above.
Note the pair allocation for aarch64 was filed PR 117477 so it would
be useful to check if it fixes that and include the bug # in the
commit if it does.
Thanks,
Andrew
>
> This still needs a lot of clean-up and testing, but I thought I might
> as well send what I have before leaving for the weekend. Does it look
> reasonable in principle?
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c1e40200806..3f9453a74fb 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -15870,6 +15870,60 @@ aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
> : base + aarch64_tune_params.memmov_cost.store_int);
> }
>
> +/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
> + RA has already decided to use. Return the total number of GPRs
> + that need to be saved and restored, including the frame link registers. */
> +static int
> +aarch64_count_gpr_saves (const HARD_REG_SET &callee_saved_regs)
> +{
> + auto saved_gprs = callee_saved_regs & reg_class_contents[GENERAL_REGS];
> + /* FIXME: popcount hack. */
> + auto nregs = popcount_hwi (saved_gprs.elts[0]);
> +
> + if (aarch64_needs_frame_chain ())
> + nregs += 2;
> + else if (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM))
> + nregs += 1;
> + return nregs;
> +}
> +
> +/* Implement TARGET_CALLEE_SAVE_COST. */
> +static int
> +aarch64_callee_save_cost (spill_cost_type, unsigned int regno,
> + machine_mode, unsigned int nregs, int mem_cost,
> + const HARD_REG_SET &callee_saved_regs,
> + bool existing_spill_p)
> +{
> + auto ngprs = aarch64_count_gpr_saves (callee_saved_regs);
> +
> + /* FIXME: handle FP and Advanced SIMD registers. */
> +
> + /* If we've already committed to saving an odd number of GPRs, assume that
> + saving one more will involve turning an STR into an STP. */
> + if (GP_REGNUM_P (regno) && nregs == 1 && (ngprs & 1))
> + return 0;
> +
> + /* If this would be the first register that we save, add the cost of
> + allocating or deallocating the frame. For GPR saves, the allocation
> + and deallocation can be folded into the save and restore. */
> + if (!existing_spill_p
> + && ngprs == 0
> + && !GP_REGNUM_P (regno))
> + mem_cost += 2;
> +
> + return mem_cost;
> +}
> +
> +/* Implement TARGET_FRAME_ALLOCATION_COST. */
> +static int
> +aarch64_frame_allocation_cost (frame_cost_type,
> + const HARD_REG_SET &callee_saved_regs)
> +{
> + if (aarch64_count_gpr_saves (callee_saved_regs) == 0)
> + return 2;
> + return 0;
> +}
> +
> /* Implement TARGET_INSN_COST. We have the opportunity to do something
> much more productive here, such as using insn attributes to cost things.
> But we don't, not yet.
> @@ -31572,6 +31626,12 @@ aarch64_libgcc_floating_mode_supported_p
> #undef TARGET_MEMORY_MOVE_COST
> #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>
> +#undef TARGET_CALLEE_SAVE_COST
> +#define TARGET_CALLEE_SAVE_COST aarch64_callee_save_cost
> +
> +#undef TARGET_FRAME_ALLOCATION_COST
> +#define TARGET_FRAME_ALLOCATION_COST aarch64_frame_allocation_cost
> +
> #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
> #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..171bb8c0b77 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -7003,6 +7003,74 @@ value to the result of that function. The arguments to that function
> are the same as to this target hook.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_CALLEE_SAVE_COST (spill_cost_type @var{cost_type}, unsigned int @var{hard_regno}, machine_mode @var{mode}, unsigned int @var{nregs}, int @var{mem_cost}, const HARD_REG_SET @var{&allocated_callee_regs}, bool @var{existing_spills_p})
> +Return the one-off cost of saving or restoring callee-saved registers
> +(also known as call-preserved register or non-volatile registers).
> +The parameters are as follows:
> +
> +@itemize
> +@item
> +@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register
> +and @samp{spill_cost_type::RESTORE} for restoring a register.
> +
> +@item
> +@var{hard_regno} and @var{mode} represent the whole register that
> +the register allocator is considering using; of these,
> +@var{nregs} registers are fully or partially callee-saved.
> +
> +@item
> +@var{mem_cost} is the normal cost for storing (for saves)
> +or loading (for restores) the @var{nregs} registers.
> +
> +@item
> +@var{allocated_callee_regs} is the set of callee-saved registers
> +that are already in use.
> +
> +@item
> +@var{existing_spills_p} is true if the register allocator has
> +already decided to spill registers to memory.
> +@end itemize
> +
> +If @var{existing_spills_p} is false, the cost of a save should account
> +for frame allocations in a way that is consistent with
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for
> +caller-saved registers. Similarly, the cost of a restore should then
> +account for frame deallocations in a way that is consistent with
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.
> +
> +Note that this hook should not attempt to apply a frequency scale
> +to the cost: it is the caller's responsibility to do that where
> +appropriate.
> +
> +The default implementation returns @var{mem_cost}, plus the allocation
> +or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},
> +where appropriate.
> +@end deftypefn
> +
> +@deftypefn {Target Hook} int TARGET_FRAME_ALLOCATION_COST (frame_cost_type @var{cost_type}, const HARD_REG_SET @var{&allocated_callee_regs})
> +Return the cost of allocating or deallocating a frame for the sake of
> +using caller-saved registers; @var{cost_type} chooses between allocation
> +and deallocation.
> +
> +This hook is only called if the register allocator has not so far
> +decided to spill any pseudo registers to memory or to use caller-saved
> +registers for values that are live across a call. It may have
> +decided to use callee-saved registers; if so, @var{allocated_callee_regs}
> +is the set of callee-saved registers it has used. There might also be
> +other reasons why a stack frame is already needed; for example,
> +@samp{get_frame_size ()} may be nonzero, or the target might already
> +require a frame for target-specific reasons.
> +
> +When the register allocator uses this hook to cost caller-saved registers,
> +it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved
> +registers, passing @samp{false} as the @var{existing_spills_p} argument.
> +The intention is to allow the target to apply an apples-for-apples
> +comparison between callee-saved and caller-saved registers when the
> +allocator has not yet committed to using registers of both types.
> +
> +The default implementation returns 0.
> +@end deftypefn
> +
> @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
> A C expression for the cost of a branch instruction. A value of 1 is
> the default; other values are interpreted relative to that. Parameter
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..eccc4d88493 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4582,6 +4582,10 @@ These macros are obsolete, new ports should use the target hook
>
> @hook TARGET_MEMORY_MOVE_COST
>
> +@hook TARGET_CALLEE_SAVE_COST
> +
> +@hook TARGET_FRAME_ALLOCATION_COST
> +
> @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
> A C expression for the cost of a branch instruction. A value of 1 is
> the default; other values are interpreted relative to that. Parameter
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..9aaac794f35 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -1199,6 +1199,9 @@ finish_update_cost_records (void)
> register was already allocated for an allocno. */
> static bool allocated_hardreg_p[FIRST_PSEUDO_REGISTER];
>
> +/* Which callee-saved hard registers we've decided to save. */
> +static HARD_REG_SET allocated_callee_save_regs;
> +
> /* Describes one element in a queue of allocnos whose costs need to be
> updated. Each allocno in the queue is known to have an allocno
> class. */
> @@ -1740,6 +1743,20 @@ check_hard_reg_p (ira_allocno_t a, int hard_regno,
> return j == nregs;
> }
>
> +/* Record that we have allocated NREGS registers starting at HARD_REGNO. */
> +
> +static void
> +record_allocation (int hard_regno, int nregs)
> +{
> + for (int i = 0; i < nregs; ++i)
> + if (!allocated_hardreg_p[hard_regno + i])
> + {
> + allocated_hardreg_p[hard_regno + i] = true;
> + if (!crtl->abi->clobbers_full_reg_p (hard_regno + i))
> + SET_HARD_REG_BIT (allocated_callee_save_regs, hard_regno + i);
> + }
> +}
> +
> /* Return number of registers needed to be saved and restored at
> function prologue/epilogue if we allocate HARD_REGNO to hold value
> of MODE. */
> @@ -1961,6 +1978,14 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> #endif
> auto_bitmap allocnos_to_spill;
> HARD_REG_SET soft_conflict_regs = {};
> + int entry_freq = REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
> + int exit_freq = REG_FREQ_FROM_BB (EXIT_BLOCK_PTR_FOR_FN (cfun));
> + /* Whether we have spilled pseudos or used caller-saved registers
> + for values that are live across a call.
> +
> + FIXME: also check for pseudos that prefer NO_REGS and not equivalences.
> + Also account for regular spills. */
> + bool existing_spills_p = caller_save_needed;
>
> ira_assert (! ALLOCNO_ASSIGNED_P (a));
> get_conflict_and_start_profitable_regs (a, retry_p,
> @@ -2175,17 +2200,45 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> /* We need to save/restore the hard register in
> epilogue/prologue. Therefore we increase the cost. */
> {
> + int nregs = hard_regno_nregs (hard_regno, mode);
> + add_cost = 0;
> rclass = REGNO_REG_CLASS (hard_regno);
> - add_cost = ((ira_memory_move_cost[mode][rclass][0]
> - + ira_memory_move_cost[mode][rclass][1])
> - * saved_nregs / hard_regno_nregs (hard_regno,
> - mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +
> + auto entry_cost = targetm.callee_save_cost
> + (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
> + ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
> + allocated_callee_save_regs, existing_spills_p);
> + add_cost += entry_cost * entry_freq;
> +
> + auto exit_cost = targetm.callee_save_cost
> + (spill_cost_type::RESTORE, hard_regno, mode, saved_nregs,
> + ira_memory_move_cost[mode][rclass][1] * saved_nregs / nregs,
> + allocated_callee_save_regs, existing_spills_p);
> + add_cost += exit_cost * exit_freq;
> +
> + /* In the event of a tie between caller-save and callee-save,
> + prefer callee-save. */
> + add_cost -= 1;
> +
> cost += add_cost;
> full_cost += add_cost;
> }
> }
> + if (!existing_spills_p && ira_need_caller_save_p (a, hard_regno))
> + {
> + add_cost = 0;
> +
> + auto entry_cost = targetm.frame_allocation_cost
> + (frame_cost_type::ALLOCATION, allocated_callee_save_regs);
> + add_cost += entry_cost * entry_freq;
> +
> + auto exit_cost = targetm.frame_allocation_cost
> + (frame_cost_type::DEALLOCATION, allocated_callee_save_regs);
> + add_cost += exit_cost * exit_freq;
> +
> + cost += add_cost;
> + full_cost += add_cost;
> + }
> if (min_cost > cost)
> min_cost = cost;
> if (min_full_cost > full_cost)
> @@ -2212,8 +2265,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> fail:
> if (best_hard_regno >= 0)
> {
> - for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--)
> - allocated_hardreg_p[best_hard_regno + i] = true;
> + record_allocation (best_hard_regno,
> + hard_regno_nregs (best_hard_regno, mode));
> spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs,
> best_hard_regno);
> }
> @@ -3369,8 +3422,7 @@ improve_allocation (void)
> /* Assign the best chosen hard register to A. */
> ALLOCNO_HARD_REGNO (a) = best;
>
> - for (j = nregs - 1; j >= 0; j--)
> - allocated_hardreg_p[best + j] = true;
> + record_allocation (best, nregs);
>
> if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
> fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..fdd80c06c77 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -3775,6 +3775,80 @@ are the same as to this target hook.",
> int, (machine_mode mode, reg_class_t rclass, bool in),
> default_memory_move_cost)
>
> +DEFHOOK
> +(callee_save_cost,
> + "Return the one-off cost of saving or restoring callee-saved registers\n\
> +(also known as call-preserved register or non-volatile registers).\n\
> +The parameters are as follows:\n\
> +\n\
> +@itemize\n\
> +@item\n\
> +@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register\n\
> +and @samp{spill_cost_type::RESTORE} for restoring a register.\n\
> +\n\
> +@item\n\
> +@var{hard_regno} and @var{mode} represent the whole register that\n\
> +the register allocator is considering using; of these,\n\
> +@var{nregs} registers are fully or partially callee-saved.\n\
> +\n\
> +@item\n\
> +@var{mem_cost} is the normal cost for storing (for saves)\n\
> +or loading (for restores) the @var{nregs} registers.\n\
> +\n\
> +@item\n\
> +@var{allocated_callee_regs} is the set of callee-saved registers\n\
> +that are already in use.\n\
> +\n\
> +@item\n\
> +@var{existing_spills_p} is true if the register allocator has\n\
> +already decided to spill registers to memory.\n\
> +@end itemize\n\
> +\n\
> +If @var{existing_spills_p} is false, the cost of a save should account\n\
> +for frame allocations in a way that is consistent with\n\
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for\n\
> +caller-saved registers. Similarly, the cost of a restore should then\n\
> +account for frame deallocations in a way that is consistent with\n\
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.\n\
> +\n\
> +Note that this hook should not attempt to apply a frequency scale\n\
> +to the cost: it is the caller's responsibility to do that where\n\
> +appropriate.\n\
> +\n\
> +The default implementation returns @var{mem_cost}, plus the allocation\n\
> +or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},\n\
> +where appropriate.",
> + int, (spill_cost_type cost_type, unsigned int hard_regno,
> + machine_mode mode, unsigned int nregs, int mem_cost,
> + const HARD_REG_SET &allocated_callee_regs, bool existing_spills_p),
> + default_callee_save_cost)
> +
> +DEFHOOK
> +(frame_allocation_cost,
> + "Return the cost of allocating or deallocating a frame for the sake of\n\
> +using caller-saved registers; @var{cost_type} chooses between allocation\n\
> +and deallocation.\n\
> +\n\
> +This hook is only called if the register allocator has not so far\n\
> +decided to spill any pseudo registers to memory or to use caller-saved\n\
> +registers for values that are live across a call. It may have\n\
> +decided to use callee-saved registers; if so, @var{allocated_callee_regs}\n\
> +is the set of callee-saved registers it has used. There might also be\n\
> +other reasons why a stack frame is already needed; for example,\n\
> +@samp{get_frame_size ()} may be nonzero, or the target might already\n\
> +require a frame for target-specific reasons.\n\
> +\n\
> +When the register allocator uses this hook to cost caller-saved registers,\n\
> +it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved\n\
> +registers, passing @samp{false} as the @var{existing_spills_p} argument.\n\
> +The intention is to allow the target to apply an apples-for-apples\n\
> +comparison between callee-saved and caller-saved registers when the\n\
> +allocator has not yet committed to using registers of both types.\n\
> +\n\
> +The default implementation returns 0.",
> + int, (frame_cost_type cost_type, const HARD_REG_SET &allocated_callee_regs),
> + default_frame_allocation_cost)
> +
> DEFHOOK
> (use_by_pieces_infrastructure_p,
> "GCC will attempt several strategies when asked to copy between\n\
> diff --git a/gcc/target.h b/gcc/target.h
> index 3e1ee68a341..2bf35e2d0ee 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -284,6 +284,18 @@ enum poly_value_estimate_kind
> POLY_VALUE_LIKELY
> };
>
> +enum class spill_cost_type
> +{
> + SAVE,
> + RESTORE
> +};
> +
> +enum class frame_cost_type
> +{
> + ALLOCATION,
> + DEALLOCATION
> +};
> +
> typedef void (*emit_support_tinfos_callback) (tree);
>
> extern bool verify_type_context (location_t, type_context_kind, const_tree,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..a8331ba5576 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -2075,6 +2075,29 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
> #endif
> }
>
> +int
> +default_callee_save_cost (spill_cost_type spill_type, unsigned int,
> + machine_mode, unsigned int, int mem_cost,
> + const HARD_REG_SET &callee_saved_regs,
> + bool existing_spills_p)
> +{
> + if (!existing_spills_p)
> + {
> + auto frame_type = (spill_type == spill_cost_type::SAVE
> + ? frame_cost_type::ALLOCATION
> + : frame_cost_type::DEALLOCATION);
> + mem_cost += targetm.frame_allocation_cost (frame_type,
> + callee_saved_regs);
> + }
> + return mem_cost;
> +}
> +
> +int
> +default_frame_allocation_cost (frame_cost_type, const HARD_REG_SET &)
> +{
> + return 0;
> +}
> +
> /* The default implementation of TARGET_SLOW_UNALIGNED_ACCESS. */
>
> bool
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..0c4c27e257e 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -234,6 +234,11 @@ extern tree default_builtin_tm_load_store (tree);
> extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
> extern int default_register_move_cost (machine_mode, reg_class_t,
> reg_class_t);
> +extern int default_callee_save_cost (spill_cost_type, unsigned int,
> + machine_mode, unsigned int, int,
> + const HARD_REG_SET &, bool);
> +extern int default_frame_allocation_cost (frame_cost_type,
> + const HARD_REG_SET &);
> extern bool default_slow_unaligned_access (machine_mode, unsigned int);
> extern HOST_WIDE_INT default_estimated_poly_value (poly_int64,
> poly_value_estimate_kind);
> --
> 2.25.1
>
Richard Sandiford <richard.sandiford@arm.com> writes:
> FWIW, here's a very rough initial version of the kind of thing
> I was thinking about. Hopefully the hook documentation describes
> the approach. It's deliberately (overly?) flexible.
Argh! I forgot:
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index de34be31f48..4b1e7eff41f 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -5253,6 +5253,7 @@ color (void)
{
allocno_stack_vec.create (ira_allocnos_num);
memset (allocated_hardreg_p, 0, sizeof (allocated_hardreg_p));
+ CLEAR_HARD_REG_SET (allocated_callee_save_regs);
ira_initiate_assign ();
do_coloring ();
ira_finish_assign ();
> This still needs a lot of clean-up and testing, but I thought I might
> as well send what I have before leaving for the weekend. Does it look
> reasonable in principle?
To be clear, "testing" of course includes performance testing
(SPEC, but alo other things).
Thanks,
Richard
On Sun, Feb 2, 2025 at 10:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
Hi,
As mentioned by others, this is not the right thing to do for x86_64.
Plus it sometimes causes shrink wrapping NOT to happen. See PR 110008
which I had to reopen as being broken by this.
Thanks,
Andrew
>
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
> --
> 2.48.1
>
On Mon, Feb 10, 2025 at 9:56 AM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Sun, Feb 2, 2025 at 10:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets. As the result, callee-saved registers
> > may not be used to preserve local variable values across calls on some
> > targets, like x86. Add a target hook for the callee-saved register cost
> > scale in epilogue and prologue used by IRA. The default version of this
> > target hook returns 1 if optimizing for size, otherwise returns the entry
> > block frequency. Add an x86 version of this target hook to restore the
> > old behavior prior to the above commit.
>
>
> Hi,
> As mentioned by others, this is not the right thing to do for x86_64.
> Plus it sometimes causes shrink wrapping NOT to happen. See PR 110008
> which I had to reopen as being broken by this.
The testcase in
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/673875.html
also fails now too for x86_64 and that is from real code unlike the
other testcases.
I would say revert the hook and fix the issue some other way.
Thanks,
Andrew
>
> Thanks,
> Andrew
>
>
> >
> > PR rtl-optimization/111673
> > PR rtl-optimization/115932
> > PR rtl-optimization/116028
> > PR rtl-optimization/117081
> > PR rtl-optimization/117082
> > PR rtl-optimization/118497
> > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > callee-saved register cost scale in epilogue and prologue.
> > * target.def (ira_callee_saved_register_cost_scale): New target
> > hook.
> > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > New.
> > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > Likewise.
> > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > New.
> > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > * doc/tm.texi: Regenerated.
> > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > New.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > gcc/config/i386/i386.cc | 11 +++++++++++
> > gcc/doc/tm.texi | 8 ++++++++
> > gcc/doc/tm.texi.in | 2 ++
> > gcc/ira-color.cc | 3 +--
> > gcc/target.def | 12 ++++++++++++
> > gcc/targhooks.cc | 8 ++++++++
> > gcc/targhooks.h | 1 +
> > 7 files changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index f89201684a8..3128973ba79 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > return false;
> > }
> >
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > /* Return true if a set of DST by the expression SRC should be allowed.
> > This prevents complex sets of likely_spilled hard regs before split1. */
> >
> > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > + ix86_ira_callee_saved_register_cost_scale
> >
> > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 0de24eda6f0..9f42913a4ef 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > The default version of this target hook always returns given class.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > +A target hook which returns the callee-saved register @var{hard_regno}
> > +cost scale in epilogue and prologue used by IRA.
> > +
> > +The default version of this target hook returns 1 if optimizing for
> > +size, otherwise returns the entry block frequency.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > A target hook which returns true if we use LRA instead of reload pass.
> >
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index 631d04131e3..6dbe22581ca 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -2388,6 +2388,8 @@ in the reload pass.
> >
> > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >
> > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +
> > @hook TARGET_LRA_P
> >
> > @hook TARGET_REGISTER_PRIORITY
> > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > index 0699b349a1a..233060e1587 100644
> > --- a/gcc/ira-color.cc
> > +++ b/gcc/ira-color.cc
> > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > + ira_memory_move_cost[mode][rclass][1])
> > * saved_nregs / hard_regno_nregs (hard_regno,
> > mode) - 1)
> > - * (optimize_size ? 1 :
> > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > cost += add_cost;
> > full_cost += add_cost;
> > }
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 4050b2ebdd4..c348b15815a 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -5714,6 +5714,18 @@ DEFHOOK
> > reg_class_t, (int, reg_class_t, reg_class_t),
> > default_ira_change_pseudo_allocno_class)
> >
> > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > + IRA. */
> > +DEFHOOK
> > +(ira_callee_saved_register_cost_scale,
> > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > +cost scale in epilogue and prologue used by IRA.\n\
> > +\n\
> > +The default version of this target hook returns 1 if optimizing for\n\
> > +size, otherwise returns the entry block frequency.",
> > + int, (int hard_regno),
> > + default_ira_callee_saved_register_cost_scale)
> > +
> > /* Return true if we use LRA instead of reload. */
> > DEFHOOK
> > (lra_p,
> > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > index f80dc8b2e7e..344075efa41 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > return cl;
> > }
> >
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
> > extern bool
> > default_lra_p (void)
> > {
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > index 7d15f632c23..8871e01430c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > reg_class_t);
> > +extern int default_ira_callee_saved_register_cost_scale (int);
> > extern bool default_lra_p (void);
> > extern int default_register_priority (int);
> > extern bool default_register_usage_leveling_p (void);
> > --
> > 2.48.1
> >
On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> > PR117081 is about regression in povray. The reducted testcase:
> Just for clarification. PR117081 is not about regression in povray.
> it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> testl
> The pr91384.c is added by r12-7417 which is peephole optimization
> expecting some specific instruction sequence, the regression can be
> fixed by adding new peephole pattern.
>
> H.J patch actually regressed povray by introducing extra push/pops
> (since it adds preference for callee save registers, in the benchmark
> using caller saved registers is much better).
> Sorry, I may not have been clear in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
>
My patch doesn't change the codegen for that code as shown by
commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Feb 7 13:49:30 2025 +0800
x86: Verify that PUSH/POP can be skipped
> PR117081 is about regression in povray. The reducted testcase:
Just for clarification. PR117081 is not about regression in povray.
it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
testl
The pr91384.c is added by r12-7417 which is peephole optimization
expecting some specific instruction sequence, the regression can be
fixed by adding new peephole pattern.
H.J patch actually regressed povray by introducing extra push/pops
(since it adds preference for callee save registers, in the benchmark
using caller saved registers is much better).
Sorry, I may not have been clear in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
On Tue, Feb 11, 2025 at 4:38 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Feb 11, 2025 at 4:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > > PR117081 is about regression in povray. The reducted testcase:
> > > Just for clarification. PR117081 is not about regression in povray.
> > > it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> > > testl
> > > The pr91384.c is added by r12-7417 which is peephole optimization
> > > expecting some specific instruction sequence, the regression can be
> > > fixed by adding new peephole pattern.
> > >
> > > H.J patch actually regressed povray by introducing extra push/pops
> > > (since it adds preference for callee save registers, in the benchmark
> > > using caller saved registers is much better).
> > > Sorry, I may not have been clear in
> > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
> > >
> >
> > My patch doesn't change the codegen for that code as shown by
> Real benchmark scenarios are a little more complex, the testcase in
> the PR is just one of the scenes, but not all.
> We are currently investigating this case and hope to find a better solution.
We need testcases to make sure that there are no regressions.
> >
> > commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
> > Author: H.J. Lu <hjl.tools@gmail.com>
> > Date: Fri Feb 7 13:49:30 2025 +0800
> >
> > x86: Verify that PUSH/POP can be skipped
> >
> >
> > --
> > H.J.
>
>
>
> --
> BR,
> Hongtao
On Tue, Feb 11, 2025 at 4:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > > PR117081 is about regression in povray. The reducted testcase:
> > Just for clarification. PR117081 is not about regression in povray.
> > it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> > testl
> > The pr91384.c is added by r12-7417 which is peephole optimization
> > expecting some specific instruction sequence, the regression can be
> > fixed by adding new peephole pattern.
> >
> > H.J patch actually regressed povray by introducing extra push/pops
> > (since it adds preference for callee save registers, in the benchmark
> > using caller saved registers is much better).
> > Sorry, I may not have been clear in
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
> >
>
> My patch doesn't change the codegen for that code as shown by
Real benchmark scenarios are a little more complex, the testcase in
the PR is just one of the scenes, but not all.
We are currently investigating this case and hope to find a better solution.
>
> commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date: Fri Feb 7 13:49:30 2025 +0800
>
> x86: Verify that PUSH/POP can be skipped
>
>
> --
> H.J.
On 2/7/25 12:18 PM, Richard Sandiford wrote:
> FWIW, here's a very rough initial version of the kind of thing
> I was thinking about. Hopefully the hook documentation describes
> the approach. It's deliberately (overly?) flexible.
>
> I've included an aarch64 version that (a) models the fact that the
> first caller-save can also allocate the frame more-or-less for free,
> and (b) once we've saved an odd number of GPRs, saving one more is
> essentialy free. I also hacked up an x86 version locally to model
> the allocation benefits of using caller-saved registers. It seemed
> to fix the povray example above.
>
> This still needs a lot of clean-up and testing, but I thought I might
> as well send what I have before leaving for the weekend. Does it look
> reasonable in principle?
>
Richard, thank you for continuing work on this problem. These hooks and
their implementation have much more sense to me. Although it is
difficult to predict that it will solve all existing related PRs. You
definitely get my approval of your hooks if you will manage not to have
new GCC testsuite failures with these hooks on x86-64, aarch64, and ppc64.
Vladimir Makarov <vmakarov@redhat.com> writes:
> On 2/7/25 12:18 PM, Richard Sandiford wrote:
>> FWIW, here's a very rough initial version of the kind of thing
>> I was thinking about. Hopefully the hook documentation describes
>> the approach. It's deliberately (overly?) flexible.
>>
>> I've included an aarch64 version that (a) models the fact that the
>> first caller-save can also allocate the frame more-or-less for free,
>> and (b) once we've saved an odd number of GPRs, saving one more is
>> essentialy free. I also hacked up an x86 version locally to model
>> the allocation benefits of using caller-saved registers. It seemed
>> to fix the povray example above.
>>
>> This still needs a lot of clean-up and testing, but I thought I might
>> as well send what I have before leaving for the weekend. Does it look
>> reasonable in principle?
>>
> Richard, thank you for continuing work on this problem. These hooks and
> their implementation have much more sense to me. Although it is
> difficult to predict that it will solve all existing related PRs. You
> definitely get my approval of your hooks if you will manage not to have
> new GCC testsuite failures with these hooks on x86-64, aarch64, and ppc64.
Thanks Vlad! Here's an updated patch that passes testing aarch64-linux-gnu
and x86_64-linux-gnu. I haven't yet checked ppc64, but will do that.
Just wanted to post what I have before going off on a long weekend.
As described below, the patch also shows no change to AArch64 SPEC2017
scores. I'm afraid I'll need help from x86 folks to do performance
testing there.
Richard
From 46ad583e65a1c5a27e2203a7571bba6eb0766bc6 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Fri, 7 Feb 2025 15:40:21 +0000
Subject: [PATCH] ira: Add new hooks for callee-save vs spills [PR117477]
To: gcc-patches@gcc.gnu.org
Following on from the discussion in:
https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675256.html
this patch removes TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE and
replaces it with two hooks: one that controls the cost of using an
extra callee-saved register and one that controls the cost of allocating
a frame for the first spill.
(The patch does not attempt to address the shrink-wrapping part of
the thread above.)
On AArch64, this is enough to fix PR117477, as verified by the new tests.
The patch does not change the SPEC2017 scores. (An earlier version
did regress perlbench, because the aarch64 hook in that version
incorrectly treated call-preserved registers as having the same
cost as call-clobbered registers, even for pseudos that are not live
across a call. Oops.)
The x86 change follows Honza's suggestion of deducting 2 from the
current cost, to model the saving of using push & pop. With the
new hooks, we could instead increase the cost of using a caller-saved
register (i.e. model the extra add and sub), but I haven't tried that.
I did however check that deducting 1 instead of 2 was enough to make
pr91384.c pass for -mabi=32 but not for -mabi=64.
gcc/
PR rtl-optimization/117477
* config/aarch64/aarch64.cc (aarch64_count_saves): New function.
(aarch64_count_above_hard_fp_saves, aarch64_callee_save_cost)
(aarch64_frame_allocation_cost): Likewise.
(TARGET_CALLEE_SAVE_COST): Define.
(TARGET_FRAME_ALLOCATION_COST): Likewise.
* config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
Replace with...
(ix86_callee_save_cost): ...this new hook.
(TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Delete.
(TARGET_CALLEE_SAVE_COST): Define.
* target.h (spill_cost_type, frame_cost_type): New enums.
* target.def (callee_save_cost, frame_allocation_cost): New hooks.
(ira_callee_saved_register_cost_scale): Delete.
* doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Delete.
(TARGET_CALLEE_SAVE_COST, TARGET_FRAME_ALLOCATION_COST): New hooks.
* doc/tm.texi: Regenerate.
* hard-reg-set.h (hard_reg_set_popcount): New function.
* ira-color.cc (allocated_memory_p): New variable.
(allocated_callee_save_regs): Likewise.
(record_allocation): New function.
(assign_hard_reg): Use targetm.frame_allocation_cost to model
the cost of the first spill or first caller save. Use
targetm.callee_save_cost to model the cost of using new callee-saved
registers. Apply the exit rather than entry frequency to the cost
of restoring a register or deallocating the frame. Update the
new variables above.
(improve_allocation): Use record_allocation.
(color): Initialize allocated_callee_save_regs.
(ira_color): Initialize allocated_memory_p.
* targhooks.h (default_callee_save_cost): Declare.
(default_frame_allocation_cost): Likewise.
* targhooks.cc (default_callee_save_cost): New function.
(default_frame_allocation_cost): Likewise.
gcc/testsuite/
PR rtl-optimization/117477
* gcc.target/aarch64/callee_save_1.c: New test.
* gcc.target/aarch64/callee_save_2.c: Likewise.
* gcc.target/aarch64/callee_save_3.c: Likewise.
---
gcc/config/aarch64/aarch64.cc | 118 ++++++++++++++++++
gcc/config/i386/i386.cc | 14 ++-
gcc/doc/tm.texi | 77 ++++++++++--
gcc/doc/tm.texi.in | 6 +-
gcc/hard-reg-set.h | 15 +++
gcc/ira-color.cc | 83 ++++++++++--
gcc/target.def | 87 +++++++++++--
gcc/target.h | 12 ++
gcc/targhooks.cc | 27 ++++
gcc/targhooks.h | 5 +
.../gcc.target/aarch64/callee_save_1.c | 12 ++
.../gcc.target/aarch64/callee_save_2.c | 14 +++
.../gcc.target/aarch64/callee_save_3.c | 12 ++
13 files changed, 445 insertions(+), 37 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/callee_save_1.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/callee_save_2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/callee_save_3.c
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f5f23f6ff4b..03ea15ec88f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15873,6 +15873,118 @@ aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
: base + aarch64_tune_params.memmov_cost.store_int);
}
+/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
+ RA has already decided to use. Return the total number of registers
+ in class RCLASS that need to be saved and restored, including the
+ frame link registers. */
+static int
+aarch64_count_saves (const HARD_REG_SET &callee_saved_regs, reg_class rclass)
+{
+ auto saved_gprs = callee_saved_regs & reg_class_contents[rclass];
+ auto nregs = hard_reg_set_popcount (saved_gprs);
+
+ if (TEST_HARD_REG_BIT (reg_class_contents[rclass], LR_REGNUM))
+ {
+ if (aarch64_needs_frame_chain ())
+ nregs += 2;
+ else if (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM))
+ nregs += 1;
+ }
+ return nregs;
+}
+
+/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
+ RA has already decided to use. Return the total number of registers
+ that need to be saved above the hard frame pointer, including the
+ frame link registers. */
+static int
+aarch64_count_above_hard_fp_saves (const HARD_REG_SET &callee_saved_regs)
+{
+ /* FP and Advanced SIMD registers are saved above the frame pointer
+ but SVE registers are saved below it. */
+ if (known_le (GET_MODE_SIZE (aarch64_reg_save_mode (V8_REGNUM)), 16U))
+ return aarch64_count_saves (callee_saved_regs, POINTER_AND_FP_REGS);
+ return aarch64_count_saves (callee_saved_regs, POINTER_REGS);
+}
+
+/* Implement TARGET_CALLEE_SAVE_COST. */
+static int
+aarch64_callee_save_cost (spill_cost_type spill_type, unsigned int regno,
+ machine_mode mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spill_p)
+{
+ /* If we've already committed to saving an odd number of GPRs, assume that
+ saving one more will involve turning an STR into an STP and an LDR
+ into an LDP. This should still be more expensive than not spilling
+ (meaning that the minimum cost is 2), but it should usually be cheaper
+ than a separate store or load. */
+ if (GP_REGNUM_P (regno)
+ && nregs == 1
+ && (aarch64_count_saves (callee_saved_regs, GENERAL_REGS) & 1))
+ return 2;
+
+ /* Similarly for saving FP registers, if we only need to save the low
+ 64 bits. (We can also use STP/LDP instead of STR/LDR for Q registers,
+ but that is less likely to be a saving.) */
+ if (FP_REGNUM_P (regno)
+ && nregs == 1
+ && known_eq (GET_MODE_SIZE (aarch64_reg_save_mode (regno)), 8U)
+ && (aarch64_count_saves (callee_saved_regs, FP_REGS) & 1))
+ return 2;
+
+ /* If this would be the first register that we save, add the cost of
+ allocating or deallocating the frame. For GPR, FPR, and Advanced SIMD
+ saves, the allocation and deallocation can be folded into the save and
+ restore. */
+ if (!existing_spill_p
+ && !GP_REGNUM_P (regno)
+ && !(FP_REGNUM_P (regno)
+ && known_le (GET_MODE_SIZE (aarch64_reg_save_mode (regno)), 16U)))
+ return default_callee_save_cost (spill_type, regno, mode, nregs, mem_cost,
+ callee_saved_regs, existing_spill_p);
+
+ return mem_cost;
+}
+
+/* Implement TARGET_FRAME_ALLOCATION_COST. */
+static int
+aarch64_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &callee_saved_regs)
+{
+ /* The intention is to model the relative costs of different approaches
+ to storing data on the stack, rather than to model the cost of saving
+ data vs not saving it. This means that we should return 0 if:
+
+ - any frame is going to be allocated with:
+
+ stp x29, x30, [sp, #-...]!
+
+ to create a frame link.
+
+ - any frame is going to be allocated with:
+
+ str x30, [sp, #-...]!
+
+ to save the link register.
+
+ In both cases, the allocation and deallocation instructions are the
+ same however we store data to the stack. (In the second case, the STR
+ could be converted to an STP by saving an extra call-preserved register,
+ but that is modeled by aarch64_callee_save_cost.)
+
+ In other cases, assume that a frame would need to be allocated with a
+ separate subtraction and deallocated with a separate addition. Saves
+ of call-clobbered registers can then reclaim this cost using a
+ predecrement store and a postincrement load.
+
+ For simplicity, give this addition or subtraction the same cost as
+ a GPR move. We could parameterize this if necessary. */
+ if (aarch64_count_above_hard_fp_saves (callee_saved_regs) != 0)
+ return aarch64_tune_params.regmove_cost->GP2GP;
+ return 0;
+}
+
/* Implement TARGET_INSN_COST. We have the opportunity to do something
much more productive here, such as using insn attributes to cost things.
But we don't, not yet.
@@ -31575,6 +31687,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST aarch64_callee_save_cost
+
+#undef TARGET_FRAME_ALLOCATION_COST
+#define TARGET_FRAME_ALLOCATION_COST aarch64_frame_allocation_cost
+
#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3128973ba79..919d7d9941c 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20600,12 +20600,15 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
return false;
}
-/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
+/* Implement TARGET_CALLEE_SAVE_COST. */
static int
-ix86_ira_callee_saved_register_cost_scale (int)
+ix86_callee_save_cost (spill_cost_type, unsigned int, machine_mode,
+ unsigned int, int mem_cost, const HARD_REG_SET &, bool)
{
- return 1;
+ /* Account for the fact that push and pop are shorter and do their
+ own allocation and deallocation. */
+ return mem_cost - 2;
}
/* Return true if a set of DST by the expression SRC should be allowed.
@@ -27086,9 +27089,8 @@ ix86_libgcc_floating_mode_supported_p
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
-#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
-#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
- ix86_ira_callee_saved_register_cost_scale
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST ix86_callee_save_cost
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 9f42913a4ef..a96700c0d38 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -3047,14 +3047,6 @@ A target hook which can change allocno class for given pseudo from
The default version of this target hook always returns given class.
@end deftypefn
-@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
-A target hook which returns the callee-saved register @var{hard_regno}
-cost scale in epilogue and prologue used by IRA.
-
-The default version of this target hook returns 1 if optimizing for
-size, otherwise returns the entry block frequency.
-@end deftypefn
-
@deftypefn {Target Hook} bool TARGET_LRA_P (void)
A target hook which returns true if we use LRA instead of reload pass.
@@ -7011,6 +7003,75 @@ value to the result of that function. The arguments to that function
are the same as to this target hook.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_CALLEE_SAVE_COST (spill_cost_type @var{cost_type}, unsigned int @var{hard_regno}, machine_mode @var{mode}, unsigned int @var{nregs}, int @var{mem_cost}, const HARD_REG_SET @var{&allocated_callee_regs}, bool @var{existing_spills_p})
+Return the one-off cost of saving or restoring callee-saved registers
+(also known as call-preserved registers or non-volatile registers).
+The parameters are as follows:
+
+@itemize
+@item
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register
+and @samp{spill_cost_type::RESTORE} for restoring a register.
+
+@item
+@var{hard_regno} and @var{mode} represent the whole register that
+the register allocator is considering using; of these,
+@var{nregs} registers are fully or partially callee-saved.
+
+@item
+@var{mem_cost} is the normal cost for storing (for saves)
+or loading (for restores) the @var{nregs} registers.
+
+@item
+@var{allocated_callee_regs} is the set of callee-saved registers
+that are already in use.
+
+@item
+@var{existing_spills_p} is true if the register allocator has
+already decided to spill registers to memory.
+@end itemize
+
+If @var{existing_spills_p} is false, the cost of a save should account
+for frame allocations in a way that is consistent with
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for spills.
+Similarly, the cost of a restore should then account for frame deallocations
+in a way that is consistent with @code{TARGET_FRAME_ALLOCATION_COST}'s
+handling of deallocations.
+
+Note that this hook should not attempt to apply a frequency scale
+to the cost: it is the caller's responsibility to do that where
+appropriate.
+
+The default implementation returns @var{mem_cost}, plus the allocation
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},
+where appropriate.
+@end deftypefn
+
+@deftypefn {Target Hook} int TARGET_FRAME_ALLOCATION_COST (frame_cost_type @var{cost_type}, const HARD_REG_SET @var{&allocated_callee_regs})
+Return the cost of allocating or deallocating a frame for the sake of
+a spill; @var{cost_type} chooses between allocation and deallocation.
+The term ``spill'' here includes both forcing a pseudo register to memory
+and using caller-saved registers for pseudo registers that are live across
+a call.
+
+This hook is only called if the register allocator has not so far
+decided to spill. The allocator may have decided to use callee-saved
+registers; if so, @var{allocated_callee_regs} is the set of callee-saved
+registers that the allocator has used. There might also be other reasons
+why a stack frame is already needed; for example, @samp{get_frame_size ()}
+might be nonzero, or the target might already require a frame for
+target-specific reasons.
+
+When the register allocator uses this hook to cost spills, it also uses
+@code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved registers, passing
+@samp{false} as the @var{existing_spills_p} argument. The intention is to
+allow the target to apply an apples-for-apples comparison between the
+cost of using callee-saved registers and using spills in cases where the
+allocator has not yet committed to using both strategies.
+
+The default implementation returns 0.
+@end deftypefn
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 6dbe22581ca..eccc4d88493 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -2388,8 +2388,6 @@ in the reload pass.
@hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
-@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
-
@hook TARGET_LRA_P
@hook TARGET_REGISTER_PRIORITY
@@ -4584,6 +4582,10 @@ These macros are obsolete, new ports should use the target hook
@hook TARGET_MEMORY_MOVE_COST
+@hook TARGET_CALLEE_SAVE_COST
+
+@hook TARGET_FRAME_ALLOCATION_COST
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/hard-reg-set.h b/gcc/hard-reg-set.h
index 48025d202b6..0d03aed5128 100644
--- a/gcc/hard-reg-set.h
+++ b/gcc/hard-reg-set.h
@@ -191,6 +191,12 @@ hard_reg_set_empty_p (const_hard_reg_set x)
return x == HARD_CONST (0);
}
+inline int
+hard_reg_set_popcount (const_hard_reg_set x)
+{
+ return popcount_hwi (x);
+}
+
#else
inline void
@@ -254,6 +260,15 @@ hard_reg_set_empty_p (const_hard_reg_set x)
bad |= x.elts[i];
return bad == 0;
}
+
+inline int
+hard_reg_set_popcount (const_hard_reg_set x)
+{
+ int count = 0;
+ for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i)
+ count += popcount_hwi (x.elts[i]);
+ return count;
+}
#endif
/* Iterator for hard register sets. */
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 233060e1587..3472541459f 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -1195,10 +1195,16 @@ finish_update_cost_records (void)
update_cost_record_pool.release ();
}
+/* True if we have allocated memory, or intend to do so. */
+static bool allocated_memory_p;
+
/* Array whose element value is TRUE if the corresponding hard
register was already allocated for an allocno. */
static bool allocated_hardreg_p[FIRST_PSEUDO_REGISTER];
+/* Which callee-saved hard registers we've decided to save. */
+static HARD_REG_SET allocated_callee_save_regs;
+
/* Describes one element in a queue of allocnos whose costs need to be
updated. Each allocno in the queue is known to have an allocno
class. */
@@ -1740,6 +1746,20 @@ check_hard_reg_p (ira_allocno_t a, int hard_regno,
return j == nregs;
}
+/* Record that we have allocated NREGS registers starting at HARD_REGNO. */
+
+static void
+record_allocation (int hard_regno, int nregs)
+{
+ for (int i = 0; i < nregs; ++i)
+ if (!allocated_hardreg_p[hard_regno + i])
+ {
+ allocated_hardreg_p[hard_regno + i] = true;
+ if (!crtl->abi->clobbers_full_reg_p (hard_regno + i))
+ SET_HARD_REG_BIT (allocated_callee_save_regs, hard_regno + i);
+ }
+}
+
/* Return number of registers needed to be saved and restored at
function prologue/epilogue if we allocate HARD_REGNO to hold value
of MODE. */
@@ -1961,6 +1981,12 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
#endif
auto_bitmap allocnos_to_spill;
HARD_REG_SET soft_conflict_regs = {};
+ int entry_freq = REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ int exit_freq = REG_FREQ_FROM_BB (EXIT_BLOCK_PTR_FOR_FN (cfun));
+ int spill_cost = 0;
+ /* Whether we have spilled pseudos or used caller-saved registers for values
+ that are live across a call. */
+ bool existing_spills_p = allocated_memory_p || caller_save_needed;
ira_assert (! ALLOCNO_ASSIGNED_P (a));
get_conflict_and_start_profitable_regs (a, retry_p,
@@ -1979,6 +2005,18 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
start_update_cost ();
mem_cost += ALLOCNO_UPDATED_MEMORY_COST (a);
+ if (!existing_spills_p)
+ {
+ auto entry_cost = targetm.frame_allocation_cost
+ (frame_cost_type::ALLOCATION, allocated_callee_save_regs);
+ spill_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.frame_allocation_cost
+ (frame_cost_type::DEALLOCATION, allocated_callee_save_regs);
+ spill_cost += exit_cost * exit_freq;
+ }
+ mem_cost += spill_cost;
+
ira_allocate_and_copy_costs (&ALLOCNO_UPDATED_HARD_REG_COSTS (a),
aclass, ALLOCNO_HARD_REG_COSTS (a));
a_costs = ALLOCNO_UPDATED_HARD_REG_COSTS (a);
@@ -2175,16 +2213,37 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
/* We need to save/restore the hard register in
epilogue/prologue. Therefore we increase the cost. */
{
+ int nregs = hard_regno_nregs (hard_regno, mode);
+ add_cost = 0;
rclass = REGNO_REG_CLASS (hard_regno);
- add_cost = ((ira_memory_move_cost[mode][rclass][0]
- + ira_memory_move_cost[mode][rclass][1])
- * saved_nregs / hard_regno_nregs (hard_regno,
- mode) - 1)
- * targetm.ira_callee_saved_register_cost_scale (hard_regno);
+
+ auto entry_cost = targetm.callee_save_cost
+ (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ /* In the event of a tie between caller-save and callee-save,
+ prefer callee-save. We apply this to the entry cost rather
+ than the exit cost since the entry frequency must be at
+ least as high as the exit frequency. */
+ if (entry_cost > 0)
+ entry_cost -= 1;
+ add_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.callee_save_cost
+ (spill_cost_type::RESTORE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][1] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ add_cost += exit_cost * exit_freq;
+
cost += add_cost;
full_cost += add_cost;
}
}
+ if (ira_need_caller_save_p (a, hard_regno))
+ {
+ cost += spill_cost;
+ full_cost += spill_cost;
+ }
if (min_cost > cost)
min_cost = cost;
if (min_full_cost > full_cost)
@@ -2211,11 +2270,13 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
fail:
if (best_hard_regno >= 0)
{
- for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--)
- allocated_hardreg_p[best_hard_regno + i] = true;
+ record_allocation (best_hard_regno,
+ hard_regno_nregs (best_hard_regno, mode));
spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs,
best_hard_regno);
}
+ else
+ allocated_memory_p = true;
if (! retry_p)
restore_costs_from_copies (a);
ALLOCNO_HARD_REGNO (a) = best_hard_regno;
@@ -3368,8 +3429,7 @@ improve_allocation (void)
/* Assign the best chosen hard register to A. */
ALLOCNO_HARD_REGNO (a) = best;
- for (j = nregs - 1; j >= 0; j--)
- allocated_hardreg_p[best + j] = true;
+ record_allocation (best, nregs);
if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
@@ -5199,6 +5259,7 @@ color (void)
{
allocno_stack_vec.create (ira_allocnos_num);
memset (allocated_hardreg_p, 0, sizeof (allocated_hardreg_p));
+ CLEAR_HARD_REG_SET (allocated_callee_save_regs);
ira_initiate_assign ();
do_coloring ();
ira_finish_assign ();
@@ -5327,10 +5388,14 @@ ira_color (void)
ira_allocno_iterator ai;
/* Setup updated costs. */
+ allocated_memory_p = false;
FOR_EACH_ALLOCNO (a, ai)
{
ALLOCNO_UPDATED_MEMORY_COST (a) = ALLOCNO_MEMORY_COST (a);
ALLOCNO_UPDATED_CLASS_COST (a) = ALLOCNO_CLASS_COST (a);
+ if (ALLOCNO_CLASS (a) == NO_REGS
+ && !ira_equiv_no_lvalue_p (ALLOCNO_REGNO (a)))
+ allocated_memory_p = true;
}
if (ira_conflicts_p)
color ();
diff --git a/gcc/target.def b/gcc/target.def
index c348b15815a..6c7cdc8126b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3775,6 +3775,81 @@ are the same as to this target hook.",
int, (machine_mode mode, reg_class_t rclass, bool in),
default_memory_move_cost)
+DEFHOOK
+(callee_save_cost,
+ "Return the one-off cost of saving or restoring callee-saved registers\n\
+(also known as call-preserved registers or non-volatile registers).\n\
+The parameters are as follows:\n\
+\n\
+@itemize\n\
+@item\n\
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register\n\
+and @samp{spill_cost_type::RESTORE} for restoring a register.\n\
+\n\
+@item\n\
+@var{hard_regno} and @var{mode} represent the whole register that\n\
+the register allocator is considering using; of these,\n\
+@var{nregs} registers are fully or partially callee-saved.\n\
+\n\
+@item\n\
+@var{mem_cost} is the normal cost for storing (for saves)\n\
+or loading (for restores) the @var{nregs} registers.\n\
+\n\
+@item\n\
+@var{allocated_callee_regs} is the set of callee-saved registers\n\
+that are already in use.\n\
+\n\
+@item\n\
+@var{existing_spills_p} is true if the register allocator has\n\
+already decided to spill registers to memory.\n\
+@end itemize\n\
+\n\
+If @var{existing_spills_p} is false, the cost of a save should account\n\
+for frame allocations in a way that is consistent with\n\
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for spills.\n\
+Similarly, the cost of a restore should then account for frame deallocations\n\
+in a way that is consistent with @code{TARGET_FRAME_ALLOCATION_COST}'s\n\
+handling of deallocations.\n\
+\n\
+Note that this hook should not attempt to apply a frequency scale\n\
+to the cost: it is the caller's responsibility to do that where\n\
+appropriate.\n\
+\n\
+The default implementation returns @var{mem_cost}, plus the allocation\n\
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},\n\
+where appropriate.",
+ int, (spill_cost_type cost_type, unsigned int hard_regno,
+ machine_mode mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &allocated_callee_regs, bool existing_spills_p),
+ default_callee_save_cost)
+
+DEFHOOK
+(frame_allocation_cost,
+ "Return the cost of allocating or deallocating a frame for the sake of\n\
+a spill; @var{cost_type} chooses between allocation and deallocation.\n\
+The term ``spill'' here includes both forcing a pseudo register to memory\n\
+and using caller-saved registers for pseudo registers that are live across\n\
+a call.\n\
+\n\
+This hook is only called if the register allocator has not so far\n\
+decided to spill. The allocator may have decided to use callee-saved\n\
+registers; if so, @var{allocated_callee_regs} is the set of callee-saved\n\
+registers that the allocator has used. There might also be other reasons\n\
+why a stack frame is already needed; for example, @samp{get_frame_size ()}\n\
+might be nonzero, or the target might already require a frame for\n\
+target-specific reasons.\n\
+\n\
+When the register allocator uses this hook to cost spills, it also uses\n\
+@code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved registers, passing\n\
+@samp{false} as the @var{existing_spills_p} argument. The intention is to\n\
+allow the target to apply an apples-for-apples comparison between the\n\
+cost of using callee-saved registers and using spills in cases where the\n\
+allocator has not yet committed to using both strategies.\n\
+\n\
+The default implementation returns 0.",
+ int, (frame_cost_type cost_type, const HARD_REG_SET &allocated_callee_regs),
+ default_frame_allocation_cost)
+
DEFHOOK
(use_by_pieces_infrastructure_p,
"GCC will attempt several strategies when asked to copy between\n\
@@ -5714,18 +5789,6 @@ DEFHOOK
reg_class_t, (int, reg_class_t, reg_class_t),
default_ira_change_pseudo_allocno_class)
-/* Scale of callee-saved register cost in epilogue and prologue used by
- IRA. */
-DEFHOOK
-(ira_callee_saved_register_cost_scale,
- "A target hook which returns the callee-saved register @var{hard_regno}\n\
-cost scale in epilogue and prologue used by IRA.\n\
-\n\
-The default version of this target hook returns 1 if optimizing for\n\
-size, otherwise returns the entry block frequency.",
- int, (int hard_regno),
- default_ira_callee_saved_register_cost_scale)
-
/* Return true if we use LRA instead of reload. */
DEFHOOK
(lra_p,
diff --git a/gcc/target.h b/gcc/target.h
index 3e1ee68a341..2bf35e2d0ee 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -284,6 +284,18 @@ enum poly_value_estimate_kind
POLY_VALUE_LIKELY
};
+enum class spill_cost_type
+{
+ SAVE,
+ RESTORE
+};
+
+enum class frame_cost_type
+{
+ ALLOCATION,
+ DEALLOCATION
+};
+
typedef void (*emit_support_tinfos_callback) (tree);
extern bool verify_type_context (location_t, type_context_kind, const_tree,
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 344075efa41..c79458e374e 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2083,6 +2083,33 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
#endif
}
+/* The default implementation of TARGET_CALLEE_SAVE_COST. */
+
+int
+default_callee_save_cost (spill_cost_type spill_type, unsigned int,
+ machine_mode, unsigned int, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spills_p)
+{
+ if (!existing_spills_p)
+ {
+ auto frame_type = (spill_type == spill_cost_type::SAVE
+ ? frame_cost_type::ALLOCATION
+ : frame_cost_type::DEALLOCATION);
+ mem_cost += targetm.frame_allocation_cost (frame_type,
+ callee_saved_regs);
+ }
+ return mem_cost;
+}
+
+/* The default implementation of TARGET_FRAME_ALLOCATION_COST. */
+
+int
+default_frame_allocation_cost (frame_cost_type, const HARD_REG_SET &)
+{
+ return 0;
+}
+
/* The default implementation of TARGET_SLOW_UNALIGNED_ACCESS. */
bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 8871e01430c..f16b58798c2 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -235,6 +235,11 @@ extern tree default_builtin_tm_load_store (tree);
extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
extern int default_register_move_cost (machine_mode, reg_class_t,
reg_class_t);
+extern int default_callee_save_cost (spill_cost_type, unsigned int,
+ machine_mode, unsigned int, int,
+ const HARD_REG_SET &, bool);
+extern int default_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &);
extern bool default_slow_unaligned_access (machine_mode, unsigned int);
extern HOST_WIDE_INT default_estimated_poly_value (poly_int64,
poly_value_estimate_kind);
diff --git a/gcc/testsuite/gcc.target/aarch64/callee_save_1.c b/gcc/testsuite/gcc.target/aarch64/callee_save_1.c
new file mode 100644
index 00000000000..f28486112f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/callee_save_1.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O2" } */
+
+int test (int x), test2 (int x);
+
+int foo (int x, int y) {
+ test (x);
+ int lhs = test2 (y);
+ return x + lhs;
+}
+
+/* { dg-final { scan-assembler {\tstp\tx19, x20, \[sp,} } } */
+/* { dg-final { scan-assembler {\tldp\tx19, x20, \[sp,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/callee_save_2.c b/gcc/testsuite/gcc.target/aarch64/callee_save_2.c
new file mode 100644
index 00000000000..744b464be2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/callee_save_2.c
@@ -0,0 +1,14 @@
+/* { dg-options "-O2 -fomit-frame-pointer" } */
+
+int test (int x), test2 (int x);
+
+int foo (int x, int y) {
+ test (x);
+ int lhs = test2 (y);
+ return x + lhs;
+}
+
+/* { dg-final { scan-assembler {\tstp\tx30, x19, \[sp,} } } */
+/* { dg-final { scan-assembler {\tldp\tx30, x19, \[sp\],} } } */
+/* { dg-final { scan-assembler {\tstr\tw1, \[sp,} } } */
+/* { dg-final { scan-assembler {\tldr\tw0, \[sp,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/callee_save_3.c b/gcc/testsuite/gcc.target/aarch64/callee_save_3.c
new file mode 100644
index 00000000000..50b6853e4ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/callee_save_3.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O2" } */
+
+float test ();
+float g;
+
+float foo (float x, float y) {
+ g = x + test ();
+ return (x + test ()) * y;
+}
+
+/* { dg-final { scan-assembler {\tstp\td14, d15, \[sp,} } } */
+/* { dg-final { scan-assembler {\tldp\td14, d15, \[sp,} } } */
> As described below, the patch also shows no change to AArch64 SPEC2017
> scores. I'm afraid I'll need help from x86 folks to do performance
> testing there.
I will look into this over weekend. I can write x86 version of the
hooks. Though in earlier email you mentioned you hacked up something, so
if you can share it with me, perhaps I can start from there.
> +/* Implement TARGET_CALLEE_SAVE_COST. */
> +static int
> +aarch64_callee_save_cost (spill_cost_type spill_type, unsigned int regno,
> + machine_mode mode, unsigned int nregs, int mem_cost,
> + const HARD_REG_SET &callee_saved_regs,
> + bool existing_spill_p)
> +{
I suppose here we only want to adjust load or save cost by 1 to account
for shorter push/pop operation...
> +
> +/* Implement TARGET_FRAME_ALLOCATION_COST. */
> +static int
> +aarch64_frame_allocation_cost (frame_cost_type,
> + const HARD_REG_SET &callee_saved_regs)
And here see if the register save area will need allocation...
So it seems relatively easy to do...
Honza
On 2/13/25 11:08 AM, Richard Sandiford wrote:
> From 46ad583e65a1c5a27e2203a7571bba6eb0766bc6 Mon Sep 17 00:00:00 2001
> From: Richard Sandiford <richard.sandiford@arm.com>
> Date: Fri, 7 Feb 2025 15:40:21 +0000
> Subject: [PATCH] ira: Add new hooks for callee-save vs spills [PR117477]
> To: gcc-patches@gcc.gnu.org
>
> Following on from the discussion in:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675256.html
>
> this patch removes TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE and
> replaces it with two hooks: one that controls the cost of using an
> extra callee-saved register and one that controls the cost of allocating
> a frame for the first spill.
>
> (The patch does not attempt to address the shrink-wrapping part of
> the thread above.)
>
> On AArch64, this is enough to fix PR117477, as verified by the new tests.
> The patch does not change the SPEC2017 scores. (An earlier version
> did regress perlbench, because the aarch64 hook in that version
> incorrectly treated call-preserved registers as having the same
> cost as call-clobbered registers, even for pseudos that are not live
> across a call. Oops.)
>
> The x86 change follows Honza's suggestion of deducting 2 from the
> current cost, to model the saving of using push & pop. With the
> new hooks, we could instead increase the cost of using a caller-saved
> register (i.e. model the extra add and sub), but I haven't tried that.
> I did however check that deducting 1 instead of 2 was enough to make
> pr91384.c pass for -mabi=32 but not for -mabi=64.
>
> gcc/
> PR rtl-optimization/117477
> * config/aarch64/aarch64.cc (aarch64_count_saves): New function.
> (aarch64_count_above_hard_fp_saves, aarch64_callee_save_cost)
> (aarch64_frame_allocation_cost): Likewise.
> (TARGET_CALLEE_SAVE_COST): Define.
> (TARGET_FRAME_ALLOCATION_COST): Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> Replace with...
> (ix86_callee_save_cost): ...this new hook.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Delete.
> (TARGET_CALLEE_SAVE_COST): Define.
> * target.h (spill_cost_type, frame_cost_type): New enums.
> * target.def (callee_save_cost, frame_allocation_cost): New hooks.
> (ira_callee_saved_register_cost_scale): Delete.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Delete.
> (TARGET_CALLEE_SAVE_COST, TARGET_FRAME_ALLOCATION_COST): New hooks.
> * doc/tm.texi: Regenerate.
> * hard-reg-set.h (hard_reg_set_popcount): New function.
> * ira-color.cc (allocated_memory_p): New variable.
> (allocated_callee_save_regs): Likewise.
> (record_allocation): New function.
> (assign_hard_reg): Use targetm.frame_allocation_cost to model
> the cost of the first spill or first caller save. Use
> targetm.callee_save_cost to model the cost of using new callee-saved
> registers. Apply the exit rather than entry frequency to the cost
> of restoring a register or deallocating the frame. Update the
> new variables above.
> (improve_allocation): Use record_allocation.
> (color): Initialize allocated_callee_save_regs.
> (ira_color): Initialize allocated_memory_p.
> * targhooks.h (default_callee_save_cost): Declare.
> (default_frame_allocation_cost): Likewise.
> * targhooks.cc (default_callee_save_cost): New function.
> (default_frame_allocation_cost): Likewise.
>
> gcc/testsuite/
> PR rtl-optimization/117477
> * gcc.target/aarch64/callee_save_1.c: New test.
> * gcc.target/aarch64/callee_save_2.c: Likewise.
> * gcc.target/aarch64/callee_save_3.c: Likewise.
The patch is very well described and it is OK for me to commit it into
the trunk. Thank you for working on this issue, Richard.
If we have some new failures on targets I believe the hook has enough
descriptive power to fix the failures.
On 2/14/25 10:43 AM, Vladimir Makarov wrote:
> The patch is very well described and it is OK for me to commit it into the
> trunk. Thank you for working on this issue, Richard.
>
> If we have some new failures on targets I believe the hook has enough
> descriptive power to fix the failures.
Note that H.J. pushed the other target hook, so we'll need to revert
that before pushing this change.
Peter
On 2/14/25 12:27 PM, Peter Bergner wrote:
> On 2/14/25 10:43 AM, Vladimir Makarov wrote:
>> The patch is very well described and it is OK for me to commit it into the
>> trunk. Thank you for working on this issue, Richard.
>>
>> If we have some new failures on targets I believe the hook has enough
>> descriptive power to fix the failures.
> Note that H.J. pushed the other target hook, so we'll need to revert
> that before pushing this change.
>
Richard's patch already contains removing HJ's patch.
Jan Hubicka <hubicka@ucw.cz> writes:
>> As described below, the patch also shows no change to AArch64 SPEC2017
>> scores. I'm afraid I'll need help from x86 folks to do performance
>> testing there.
>
> I will look into this over weekend. I can write x86 version of the
> hooks. Though in earlier email you mentioned you hacked up something, so
> if you can share it with me, perhaps I can start from there.
Sorry, was away until today. But the earlier hacked up version was
essentially what I included in this patch.
Thanks,
Richard
> Jan Hubicka <hubicka@ucw.cz> writes:
> >> As described below, the patch also shows no change to AArch64 SPEC2017
> >> scores. I'm afraid I'll need help from x86 folks to do performance
> >> testing there.
> >
> > I will look into this over weekend. I can write x86 version of the
> > hooks. Though in earlier email you mentioned you hacked up something, so
> > if you can share it with me, perhaps I can start from there.
>
> Sorry, was away until today. But the earlier hacked up version was
> essentially what I included in this patch.
I added few special cases to that (since we don't always push) but
overall it looks good to me. I hope to have benchmarks done today
(I was benchmarking some other patches over weekend).
Honza
>
> Thanks,
> Richard
Hello,
I looked into updating the hook
> -/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +/* Implement TARGET_CALLEE_SAVE_COST. */
>
> static int
> -ix86_ira_callee_saved_register_cost_scale (int)
> +ix86_callee_save_cost (spill_cost_type, unsigned int, machine_mode,
> + unsigned int, int mem_cost, const HARD_REG_SET &, bool)
> {
> - return 1;
> + /* Account for the fact that push and pop are shorter and do their
> + own allocation and deallocation. */
> + return mem_cost - 2;
> }
I think this is fine for usual performance metrics of push/pop. For
size we now end up with cost of 0, which is likely not right, so I added
a special case and return 1. Size costs do not quite correspond to
mov-mov sizes, so I will try to fix it and see if that results in better
code size.
I also added a test that regno in question is integer registers. While
we do not callee save XMM for the defualt ABI, Microsoft version does.
I am not sure how push2 and pushp extensions comes into game, but we can
do that once we have hardward to test.
Concerning x86 specifics, there is cost for allocating stack frame. So
if the function has nothing on stack frame push/pop becomes bit better
candidate then a spill. The hook you added does not seem to be able to
test this, since it does not have frame size as an parameter. I wonder
if there is easy way to get it in?
Also for old CPUs with no stack prediction engine we split either one or
two push instructions into adjustemnet+move pair. I do not see how to
put that into game, since the cost of 1 or 2 reigsters then differs from
3 or more, but also I think we do not need to care about this, since all
reaosnably current CPUs have stack prediction.
I am benchmarking updated patch and will send once it is done.
Honza
Jan Hubicka <hubicka@ucw.cz> writes:
> Concerning x86 specifics, there is cost for allocating stack frame. So
> if the function has nothing on stack frame push/pop becomes bit better
> candidate then a spill. The hook you added does not seem to be able to
> test this, since it does not have frame size as an parameter. I wonder
> if there is easy way to get it in?
The main frame size is available globally as get_frame_size ().
There's also the question of whether a frame needs to be created
for other reasons, such as an alloca call, but I suppose setting
up a frame for just alloca would also use push on x86?
> Also for old CPUs with no stack prediction engine we split either one or
> two push instructions into adjustemnet+move pair. I do not see how to
> put that into game, since the cost of 1 or 2 reigsters then differs from
> 3 or more, but also I think we do not need to care about this, since all
> reaosnably current CPUs have stack prediction.
Yeah. The hook does allow you test how many registers have been pushed,
and how many will be pushed after the change that is being costed.
But giving a higher cost for the first two registers would probably
tend to penalise using callee-saved registers for the first few allocnos
that we colour, which are also likely to be the most important allocnos.
Trying to cost the difference might therefore be counter-productive.
> I am benchmarking updated patch and will send once it is done.
Thanks!
Richard
> Jan Hubicka <hubicka@ucw.cz> writes:
> > Concerning x86 specifics, there is cost for allocating stack frame. So
> > if the function has nothing on stack frame push/pop becomes bit better
> > candidate then a spill. The hook you added does not seem to be able to
> > test this, since it does not have frame size as an parameter. I wonder
> > if there is easy way to get it in?
>
> The main frame size is available globally as get_frame_size ().
> There's also the question of whether a frame needs to be created
> for other reasons, such as an alloca call, but I suppose setting
> up a frame for just alloca would also use push on x86?
Usually the frame is first created by push/pop instructions (which are
callee saves and possibly frame pointer) and the remaining capacity is
allocated using add/sub of ESP pointer. If these can be avoided we save
about 8 bytes of code. Performance wise the stack engine will likely
completely hide the overhead of extra add/sub.
We need add/sub for caller saves, spilling and on-stack variables.
We may be able to hide it in red-zone, but only for leafs.
get_frame_size I think only tells me about hte on-stack variables at the
time ira-color is performed.
This is something that would be nice to model better, but also is likely
not critical. So I only mentioned it in case you or Vladimir can come
up with a nice way to fit this in.
>
> > Also for old CPUs with no stack prediction engine we split either one or
> > two push instructions into adjustemnet+move pair. I do not see how to
> > put that into game, since the cost of 1 or 2 reigsters then differs from
> > 3 or more, but also I think we do not need to care about this, since all
> > reaosnably current CPUs have stack prediction.
>
> Yeah. The hook does allow you test how many registers have been pushed,
> and how many will be pushed after the change that is being costed.
> But giving a higher cost for the first two registers would probably
> tend to penalise using callee-saved registers for the first few allocnos
> that we colour, which are also likely to be the most important allocnos.
> Trying to cost the difference might therefore be counter-productive.
Actually my memory got this backwards. While I experimented by avoiding
only some push/pop instructions on CPUs w/o stack engine (those were
produced before 2003) it is not in mainline. All we do is the oposite
conversion. Sometimes we turn sub/add of ESP into shorter but more
expensive push or pop. This may be accounted in frame allocation cost,
but again, it is only about extra old CPUs.
Honza
>
> > I am benchmarking updated patch and will send once it is done.
>
> Thanks!
>
> Richard
Hi,
this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
and -O2 -flto. For non -Os and no Windows ABI should be pratically the
same as your variant that was simply returning mem_cost - 2.
It seems mostly SPEC netural. With -O2 -flto there is
small 4% improvement on povray (which was mentioned earlier) and also
5% regression on perlbench.
I will check to see if I can figure out what is going out with
perlbench. However I relalized that -flto is probably hidding some of
differences becuase of cross-module inlining and IPA-RA, so I am
retesting with -O2 alone and -O2 -fno-ipa-ra to stress the costs little
more.
I also noticed that move costs for -Os are not really set according to
size of the instructions, so I will experiment with fixing that
incrementally.
Honza
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..3d09448c326 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20713,12 +20713,27 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
return false;
}
-/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
+/* Implement TARGET_CALLEE_SAVE_COST. */
static int
-ix86_ira_callee_saved_register_cost_scale (int)
-{
- return 1;
+ix86_callee_save_cost (spill_cost_type, unsigned int hard_regno, machine_mode,
+ unsigned int, int mem_cost, const HARD_REG_SET &, bool)
+{
+ /* Account for the fact that push and pop are shorter and do their
+ own allocation and deallocation. */
+ if (GENERAL_REGNO_P (hard_regno))
+ {
+ /* push is 1 byte while typical spill is 4-5 bytes.
+ ??? We probably should adjust size costs accordingly.
+ Costs are relative to reg-reg move that has 2 bytes for 32bit
+ and 3 bytes otherwise. */
+ if (optimize_function_for_size_p (cfun))
+ return 1;
+ /* Be sure that no cost table sets cost to 2, so we end up with 0. */
+ gcc_checking_assert (mem_cost > 2);
+ return mem_cost - 2;
+ }
+ return mem_cost;
}
/* Return true if a set of DST by the expression SRC should be allowed.
@@ -27199,9 +27214,8 @@ ix86_libgcc_floating_mode_supported_p
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
-#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
-#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
- ix86_ira_callee_saved_register_cost_scale
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST ix86_callee_save_cost
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
On Wed, Feb 19, 2025 at 9:06 PM Jan Hubicka <hubicka@ucw.cz> wrote:
>
> Hi,
> this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
> and -O2 -flto. For non -Os and no Windows ABI should be pratically the
> same as your variant that was simply returning mem_cost - 2.
>
I've tested O2/(Ofast march=native) with SPEC2017 on SPR, mostly
neutral (small improvement on povray).
> It seems mostly SPEC netural. With -O2 -flto there is
> small 4% improvement on povray (which was mentioned earlier) and also
> 5% regression on perlbench.
>
> I will check to see if I can figure out what is going out with
> perlbench. However I relalized that -flto is probably hidding some of
> differences becuase of cross-module inlining and IPA-RA, so I am
> retesting with -O2 alone and -O2 -fno-ipa-ra to stress the costs little
> more.
>
> I also noticed that move costs for -Os are not really set according to
> size of the instructions, so I will experiment with fixing that
> incrementally.
>
> Honza
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 560e6525b56..3d09448c326 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20713,12 +20713,27 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> -/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +/* Implement TARGET_CALLEE_SAVE_COST. */
>
> static int
> -ix86_ira_callee_saved_register_cost_scale (int)
> -{
> - return 1;
> +ix86_callee_save_cost (spill_cost_type, unsigned int hard_regno, machine_mode,
> + unsigned int, int mem_cost, const HARD_REG_SET &, bool)
> +{
> + /* Account for the fact that push and pop are shorter and do their
> + own allocation and deallocation. */
> + if (GENERAL_REGNO_P (hard_regno))
> + {
> + /* push is 1 byte while typical spill is 4-5 bytes.
> + ??? We probably should adjust size costs accordingly.
> + Costs are relative to reg-reg move that has 2 bytes for 32bit
> + and 3 bytes otherwise. */
> + if (optimize_function_for_size_p (cfun))
> + return 1;
> + /* Be sure that no cost table sets cost to 2, so we end up with 0. */
> + gcc_checking_assert (mem_cost > 2);
> + return mem_cost - 2;
> + }
> + return mem_cost;
> }
>
> /* Return true if a set of DST by the expression SRC should be allowed.
> @@ -27199,9 +27214,8 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> -#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> -#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> - ix86_ira_callee_saved_register_cost_scale
> +#undef TARGET_CALLEE_SAVE_COST
> +#define TARGET_CALLEE_SAVE_COST ix86_callee_save_cost
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> On Wed, Feb 19, 2025 at 9:06 PM Jan Hubicka <hubicka@ucw.cz> wrote:
> >
> > Hi,
> > this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
> > and -O2 -flto. For non -Os and no Windows ABI should be pratically the
> > same as your variant that was simply returning mem_cost - 2.
> >
> I've tested O2/(Ofast march=native) with SPEC2017 on SPR, mostly
> neutral (small improvement on povray).
So I got ryzen3 runs with -O2, -O3 and -fno-ipa-ra.
Overall differences are quite small, but I think it is expected. Here is
what I get with -O2:
--------------- ------- --------- --------- ------- --------- ---------
500.perlbench_r 1 188 8.46 S 1 183 8.69 S
500.perlbench_r 1 187 8.52 * 1 182 8.75 *
500.perlbench_r 1 186 8.56 S 1 182 8.75 S
502.gcc_r 1 139 10.2 S 1 137 10.3 *
502.gcc_r 1 139 10.2 S 1 137 10.4 S
502.gcc_r 1 139 10.2 * 1 137 10.3 S
505.mcf_r 1 187 8.66 * 1 188 8.61 S
505.mcf_r 1 186 8.70 S 1 187 8.66 *
505.mcf_r 1 188 8.62 S 1 187 8.66 S
520.omnetpp_r 1 213 6.15 * 1 207 6.32 *
520.omnetpp_r 1 212 6.18 S 1 206 6.37 S
520.omnetpp_r 1 219 5.99 S 1 215 6.11 S
523.xalancbmk_r 1 -- CE 1 -- CE
525.x264_r 1 135 13.0 S 1 135 12.9 *
525.x264_r 1 135 13.0 * 1 135 12.9 S
525.x264_r 1 135 13.0 S 1 135 12.9 S
531.deepsjeng_r 1 167 6.86 * 1 167 6.85 S
531.deepsjeng_r 1 167 6.86 S 1 168 6.84 S
531.deepsjeng_r 1 167 6.86 S 1 167 6.85 *
541.leela_r 1 296 5.60 S 1 292 5.67 *
541.leela_r 1 293 5.65 S 1 293 5.65 S
541.leela_r 1 296 5.60 * 1 292 5.67 S
548.exchange2_r 1 208 12.6 S 1 208 12.6 S
548.exchange2_r 1 208 12.6 * 1 208 12.6 S
548.exchange2_r 1 208 12.6 S 1 208 12.6 *
557.xz_r 1 194 5.58 S 1 193 5.58 S
557.xz_r 1 192 5.62 S 1 193 5.60 S
557.xz_r 1 193 5.60 * 1 193 5.59 *
=================================================================================
500.perlbench_r 1 187 8.52 * 1 182 8.75 *
502.gcc_r 1 139 10.2 * 1 137 10.3 *
505.mcf_r 1 187 8.66 * 1 187 8.66 *
520.omnetpp_r 1 213 6.15 * 1 207 6.32 *
523.xalancbmk_r NR NR
525.x264_r 1 135 13.0 * 1 135 12.9 *
531.deepsjeng_r 1 167 6.86 * 1 167 6.85 *
541.leela_r 1 296 5.60 * 1 292 5.67 *
548.exchange2_r 1 208 12.6 * 1 208 12.6 *
557.xz_r 1 193 5.60 * 1 193 5.59 *
Est. SPECrate2017_int_base 8.17
Est. SPECrate2017_int_peak 8.24
Perlbench seems to improve consistently without LTO (bot -O2, -O3 and
-O2 -fno-ipa-ra and I think it may be just a luck with code layout
gcc is quie concistent in all settings. Overall it seems consistent
little win. For fp tests, I see only off-noise povray differences and only in
-Ofast and -Ofast -flto.
Comparing code sizes at -O2:
500.perlbench_r/run/run_base_refrate_regalloc-m64.0000/perlbench_r_base.regalloc-m64 1699987 1731648 101.86
502.gcc_r/run/run_base_refrate_regalloc-m64.0000/cpugcc_r_base.regalloc-m64 7072031 7226911 102.19
503.bwaves_r/run/run_base_refrate_regalloc-m64.0000/bwaves_r_base.regalloc-m64 41327 41327 100.00
505.mcf_r/run/run_base_refrate_regalloc-m64.0000/mcf_r_base.regalloc-m64 17023 17023 100.00
507.cactuBSSN_r/run/run_base_refrate_regalloc-m64.0000/cactusBSSN_r_base.regalloc-m64 3432326 3464950 100.95
508.namd_r/run/run_base_refrate_regalloc-m64.0000/namd_r_base.regalloc-m64 835954 835457 99.94
510.parest_r/run/run_base_refrate_regalloc-m64.0000/parest_r_base.regalloc-m64 7498066 7587378 101.19
511.povray_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_511_base.regalloc-m64 18206 18222 100.08
511.povray_r/run/run_base_refrate_regalloc-m64.0000/povray_r_base.regalloc-m64 754591 761695 100.94
519.lbm_r/run/run_base_refrate_regalloc-m64.0000/lbm_r_base.regalloc-m64 10900 10916 100.14
520.omnetpp_r/run/run_base_refrate_regalloc-m64.0000/omnetpp_r_base.regalloc-m64 1403348 1425556 101.58
521.wrf_r/run/run_base_refrate_regalloc-m64.0000/diffwrf_521_base.regalloc-m64 16388136 16394552 100.03
521.wrf_r/run/run_base_refrate_regalloc-m64.0000/wrf_r_base.regalloc-m64 22293527 22302167 100.03
525.x264_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_525_base.regalloc-m64 18206 18222 100.08
525.x264_r/run/run_base_refrate_regalloc-m64.0000/ldecod_r_base.regalloc-m64 398564 401667 100.77
525.x264_r/run/run_base_refrate_regalloc-m64.0000/x264_r_base.regalloc-m64 405515 407051 100.37
526.blender_r/run/run_base_refrate_regalloc-m64.0000/blender_r_base.regalloc-m64 7567792 7631536 100.84
526.blender_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_526_base.regalloc-m64 18206 18222 100.08
527.cam4_r/run/run_base_refrate_regalloc-m64.0000/cam4_r_base.regalloc-m64 5957695 5969535 100.19
527.cam4_r/run/run_base_refrate_regalloc-m64.0000/cam4_validate_527_base.regalloc-m64 606591 608767 100.35
531.deepsjeng_r/run/run_base_refrate_regalloc-m64.0000/deepsjeng_r_base.regalloc-m64 75304 76248 101.25
538.imagick_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_538_base.regalloc-m64 18206 18222 100.08
538.imagick_r/run/run_base_refrate_regalloc-m64.0000/imagick_r_base.regalloc-m64 1638858 1651628 100.77
541.leela_r/run/run_base_refrate_regalloc-m64.0000/leela_r_base.regalloc-m64 132636 133146 100.38
544.nab_r/run/run_base_refrate_regalloc-m64.0000/nab_r_base.regalloc-m64 150146 150513 100.24
548.exchange2_r/run/run_base_refrate_regalloc-m64.0000/exchange2_r_base.regalloc-m64 76709 76709 100.00
549.fotonik3d_r/run/run_base_refrate_regalloc-m64.0000/fotonik3d_r_base.regalloc-m64 464940 465260 100.06
554.roms_r/run/run_base_refrate_regalloc-m64.0000/roms_r_base.regalloc-m64 833926 834166 100.02
557.xz_r/run/run_base_refrate_regalloc-m64.0000/xz_r_base.regalloc-m64 130345 133253 102.23
The 2% code size increase for gcc as not very nice, but I think also
expected, since we make compiler to use less push/pop instructions.
There are 34091 push instructions with patch and 38939 without.
With -fno-ipa-ra the story is similar:
500.perlbench_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/perlbench_r_base.regalloc-O2-noipara-m64 1701299 1733024 101.86
502.gcc_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cpugcc_r_base.regalloc-O2-noipara-m64 7074527 7229855 102.19
503.bwaves_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/bwaves_r_base.regalloc-O2-noipara-m64 41327 41327 100.00
505.mcf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/mcf_r_base.regalloc-O2-noipara-m64 17151 17151 100.00
507.cactuBSSN_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cactusBSSN_r_base.regalloc-O2-noipara-m64 3432326 3464950 100.95
508.namd_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/namd_r_base.regalloc-O2-noipara-m64 835954 835457 99.94
510.parest_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/parest_r_base.regalloc-O2-noipara-m64 7504722 7594098 101.19
511.povray_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_511_base.regalloc-O2-noipara-m64 18206 18222 100.08
511.povray_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/povray_r_base.regalloc-O2-noipara-m64 756639 763487 100.90
519.lbm_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/lbm_r_base.regalloc-O2-noipara-m64 10900 10916 100.14
520.omnetpp_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/omnetpp_r_base.regalloc-O2-noipara-m64 1403412 1425748 101.59
521.wrf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/diffwrf_521_base.regalloc-O2-noipara-m64 16394344 16400504 100.03
521.wrf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/wrf_r_base.regalloc-O2-noipara-m64 22300503 22308759 100.03
525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_525_base.regalloc-O2-noipara-m64 18206 18222 100.08
525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/ldecod_r_base.regalloc-O2-noipara-m64 399204 402179 100.74
525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/x264_r_base.regalloc-O2-noipara-m64 406251 408299 100.50
526.blender_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/blender_r_base.regalloc-O2-noipara-m64 7583536 7648304 100.85
526.blender_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_526_base.regalloc-O2-noipara-m64 18206 18222 100.08
527.cam4_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cam4_r_base.regalloc-O2-noipara-m64 5962335 5974239 100.19
527.cam4_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cam4_validate_527_base.regalloc-O2-noipara-m64 607327 609375 100.33
531.deepsjeng_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/deepsjeng_r_base.regalloc-O2-noipara-m64 75240 76248 101.33
538.imagick_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_538_base.regalloc-O2-noipara-m64 18206 18222 100.08
538.imagick_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagick_r_base.regalloc-O2-noipara-m64 1641226 1654060 100.78
541.leela_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/leela_r_base.regalloc-O2-noipara-m64 132764 133274 100.38
544.nab_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/nab_r_base.regalloc-O2-noipara-m64 150498 150929 100.28
548.exchange2_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/exchange2_r_base.regalloc-O2-noipara-m64 76921 76921 100.00
549.fotonik3d_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/fotonik3d_r_base.regalloc-O2-noipara-m64 464940 465260 100.06
554.roms_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/roms_r_base.regalloc-O2-noipara-m64 833926 834166 100.02
557.xz_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/xz_r_base.regalloc-O2-noipara-m64 130697 133573 102.20
Overall I think new costing works reasonably well.
Honza
Jan Hubicka <hubicka@ucw.cz> writes:
>> On Wed, Feb 19, 2025 at 9:06 PM Jan Hubicka <hubicka@ucw.cz> wrote:
>> >
>> > Hi,
>> > this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
>> > and -O2 -flto. For non -Os and no Windows ABI should be pratically the
>> > same as your variant that was simply returning mem_cost - 2.
>> >
>> I've tested O2/(Ofast march=native) with SPEC2017 on SPR, mostly
>> neutral (small improvement on povray).
>
> So I got ryzen3 runs with -O2, -O3 and -fno-ipa-ra.
>
> Overall differences are quite small, but I think it is expected. Here is
> what I get with -O2:
> --------------- ------- --------- --------- ------- --------- ---------
> 500.perlbench_r 1 188 8.46 S 1 183 8.69 S
> 500.perlbench_r 1 187 8.52 * 1 182 8.75 *
> 500.perlbench_r 1 186 8.56 S 1 182 8.75 S
> 502.gcc_r 1 139 10.2 S 1 137 10.3 *
> 502.gcc_r 1 139 10.2 S 1 137 10.4 S
> 502.gcc_r 1 139 10.2 * 1 137 10.3 S
> 505.mcf_r 1 187 8.66 * 1 188 8.61 S
> 505.mcf_r 1 186 8.70 S 1 187 8.66 *
> 505.mcf_r 1 188 8.62 S 1 187 8.66 S
> 520.omnetpp_r 1 213 6.15 * 1 207 6.32 *
> 520.omnetpp_r 1 212 6.18 S 1 206 6.37 S
> 520.omnetpp_r 1 219 5.99 S 1 215 6.11 S
> 523.xalancbmk_r 1 -- CE 1 -- CE
> 525.x264_r 1 135 13.0 S 1 135 12.9 *
> 525.x264_r 1 135 13.0 * 1 135 12.9 S
> 525.x264_r 1 135 13.0 S 1 135 12.9 S
> 531.deepsjeng_r 1 167 6.86 * 1 167 6.85 S
> 531.deepsjeng_r 1 167 6.86 S 1 168 6.84 S
> 531.deepsjeng_r 1 167 6.86 S 1 167 6.85 *
> 541.leela_r 1 296 5.60 S 1 292 5.67 *
> 541.leela_r 1 293 5.65 S 1 293 5.65 S
> 541.leela_r 1 296 5.60 * 1 292 5.67 S
> 548.exchange2_r 1 208 12.6 S 1 208 12.6 S
> 548.exchange2_r 1 208 12.6 * 1 208 12.6 S
> 548.exchange2_r 1 208 12.6 S 1 208 12.6 *
> 557.xz_r 1 194 5.58 S 1 193 5.58 S
> 557.xz_r 1 192 5.62 S 1 193 5.60 S
> 557.xz_r 1 193 5.60 * 1 193 5.59 *
> =================================================================================
> 500.perlbench_r 1 187 8.52 * 1 182 8.75 *
> 502.gcc_r 1 139 10.2 * 1 137 10.3 *
> 505.mcf_r 1 187 8.66 * 1 187 8.66 *
> 520.omnetpp_r 1 213 6.15 * 1 207 6.32 *
> 523.xalancbmk_r NR NR
> 525.x264_r 1 135 13.0 * 1 135 12.9 *
> 531.deepsjeng_r 1 167 6.86 * 1 167 6.85 *
> 541.leela_r 1 296 5.60 * 1 292 5.67 *
> 548.exchange2_r 1 208 12.6 * 1 208 12.6 *
> 557.xz_r 1 193 5.60 * 1 193 5.59 *
> Est. SPECrate2017_int_base 8.17
> Est. SPECrate2017_int_peak 8.24
>
> Perlbench seems to improve consistently without LTO (bot -O2, -O3 and
> -O2 -fno-ipa-ra and I think it may be just a luck with code layout
> gcc is quie concistent in all settings. Overall it seems consistent
> little win. For fp tests, I see only off-noise povray differences and only in
> -Ofast and -Ofast -flto.
>
> Comparing code sizes at -O2:
>
> 500.perlbench_r/run/run_base_refrate_regalloc-m64.0000/perlbench_r_base.regalloc-m64 1699987 1731648 101.86
> 502.gcc_r/run/run_base_refrate_regalloc-m64.0000/cpugcc_r_base.regalloc-m64 7072031 7226911 102.19
> 503.bwaves_r/run/run_base_refrate_regalloc-m64.0000/bwaves_r_base.regalloc-m64 41327 41327 100.00
> 505.mcf_r/run/run_base_refrate_regalloc-m64.0000/mcf_r_base.regalloc-m64 17023 17023 100.00
> 507.cactuBSSN_r/run/run_base_refrate_regalloc-m64.0000/cactusBSSN_r_base.regalloc-m64 3432326 3464950 100.95
> 508.namd_r/run/run_base_refrate_regalloc-m64.0000/namd_r_base.regalloc-m64 835954 835457 99.94
> 510.parest_r/run/run_base_refrate_regalloc-m64.0000/parest_r_base.regalloc-m64 7498066 7587378 101.19
> 511.povray_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_511_base.regalloc-m64 18206 18222 100.08
> 511.povray_r/run/run_base_refrate_regalloc-m64.0000/povray_r_base.regalloc-m64 754591 761695 100.94
> 519.lbm_r/run/run_base_refrate_regalloc-m64.0000/lbm_r_base.regalloc-m64 10900 10916 100.14
> 520.omnetpp_r/run/run_base_refrate_regalloc-m64.0000/omnetpp_r_base.regalloc-m64 1403348 1425556 101.58
> 521.wrf_r/run/run_base_refrate_regalloc-m64.0000/diffwrf_521_base.regalloc-m64 16388136 16394552 100.03
> 521.wrf_r/run/run_base_refrate_regalloc-m64.0000/wrf_r_base.regalloc-m64 22293527 22302167 100.03
> 525.x264_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_525_base.regalloc-m64 18206 18222 100.08
> 525.x264_r/run/run_base_refrate_regalloc-m64.0000/ldecod_r_base.regalloc-m64 398564 401667 100.77
> 525.x264_r/run/run_base_refrate_regalloc-m64.0000/x264_r_base.regalloc-m64 405515 407051 100.37
> 526.blender_r/run/run_base_refrate_regalloc-m64.0000/blender_r_base.regalloc-m64 7567792 7631536 100.84
> 526.blender_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_526_base.regalloc-m64 18206 18222 100.08
> 527.cam4_r/run/run_base_refrate_regalloc-m64.0000/cam4_r_base.regalloc-m64 5957695 5969535 100.19
> 527.cam4_r/run/run_base_refrate_regalloc-m64.0000/cam4_validate_527_base.regalloc-m64 606591 608767 100.35
> 531.deepsjeng_r/run/run_base_refrate_regalloc-m64.0000/deepsjeng_r_base.regalloc-m64 75304 76248 101.25
> 538.imagick_r/run/run_base_refrate_regalloc-m64.0000/imagevalidate_538_base.regalloc-m64 18206 18222 100.08
> 538.imagick_r/run/run_base_refrate_regalloc-m64.0000/imagick_r_base.regalloc-m64 1638858 1651628 100.77
> 541.leela_r/run/run_base_refrate_regalloc-m64.0000/leela_r_base.regalloc-m64 132636 133146 100.38
> 544.nab_r/run/run_base_refrate_regalloc-m64.0000/nab_r_base.regalloc-m64 150146 150513 100.24
> 548.exchange2_r/run/run_base_refrate_regalloc-m64.0000/exchange2_r_base.regalloc-m64 76709 76709 100.00
> 549.fotonik3d_r/run/run_base_refrate_regalloc-m64.0000/fotonik3d_r_base.regalloc-m64 464940 465260 100.06
> 554.roms_r/run/run_base_refrate_regalloc-m64.0000/roms_r_base.regalloc-m64 833926 834166 100.02
> 557.xz_r/run/run_base_refrate_regalloc-m64.0000/xz_r_base.regalloc-m64 130345 133253 102.23
>
> The 2% code size increase for gcc as not very nice, but I think also
> expected, since we make compiler to use less push/pop instructions.
> There are 34091 push instructions with patch and 38939 without.
>
> With -fno-ipa-ra the story is similar:
>
> 500.perlbench_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/perlbench_r_base.regalloc-O2-noipara-m64 1701299 1733024 101.86
> 502.gcc_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cpugcc_r_base.regalloc-O2-noipara-m64 7074527 7229855 102.19
> 503.bwaves_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/bwaves_r_base.regalloc-O2-noipara-m64 41327 41327 100.00
> 505.mcf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/mcf_r_base.regalloc-O2-noipara-m64 17151 17151 100.00
> 507.cactuBSSN_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cactusBSSN_r_base.regalloc-O2-noipara-m64 3432326 3464950 100.95
> 508.namd_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/namd_r_base.regalloc-O2-noipara-m64 835954 835457 99.94
> 510.parest_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/parest_r_base.regalloc-O2-noipara-m64 7504722 7594098 101.19
> 511.povray_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_511_base.regalloc-O2-noipara-m64 18206 18222 100.08
> 511.povray_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/povray_r_base.regalloc-O2-noipara-m64 756639 763487 100.90
> 519.lbm_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/lbm_r_base.regalloc-O2-noipara-m64 10900 10916 100.14
> 520.omnetpp_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/omnetpp_r_base.regalloc-O2-noipara-m64 1403412 1425748 101.59
> 521.wrf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/diffwrf_521_base.regalloc-O2-noipara-m64 16394344 16400504 100.03
> 521.wrf_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/wrf_r_base.regalloc-O2-noipara-m64 22300503 22308759 100.03
> 525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_525_base.regalloc-O2-noipara-m64 18206 18222 100.08
> 525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/ldecod_r_base.regalloc-O2-noipara-m64 399204 402179 100.74
> 525.x264_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/x264_r_base.regalloc-O2-noipara-m64 406251 408299 100.50
> 526.blender_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/blender_r_base.regalloc-O2-noipara-m64 7583536 7648304 100.85
> 526.blender_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_526_base.regalloc-O2-noipara-m64 18206 18222 100.08
> 527.cam4_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cam4_r_base.regalloc-O2-noipara-m64 5962335 5974239 100.19
> 527.cam4_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/cam4_validate_527_base.regalloc-O2-noipara-m64 607327 609375 100.33
> 531.deepsjeng_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/deepsjeng_r_base.regalloc-O2-noipara-m64 75240 76248 101.33
> 538.imagick_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagevalidate_538_base.regalloc-O2-noipara-m64 18206 18222 100.08
> 538.imagick_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/imagick_r_base.regalloc-O2-noipara-m64 1641226 1654060 100.78
> 541.leela_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/leela_r_base.regalloc-O2-noipara-m64 132764 133274 100.38
> 544.nab_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/nab_r_base.regalloc-O2-noipara-m64 150498 150929 100.28
> 548.exchange2_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/exchange2_r_base.regalloc-O2-noipara-m64 76921 76921 100.00
> 549.fotonik3d_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/fotonik3d_r_base.regalloc-O2-noipara-m64 464940 465260 100.06
> 554.roms_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/roms_r_base.regalloc-O2-noipara-m64 833926 834166 100.02
> 557.xz_r/run/run_base_refrate_regalloc-O2-noipara-m64.0000/xz_r_base.regalloc-O2-noipara-m64 130697 133573 102.20
>
> Overall I think new costing works reasonably well.
Thanks for running these. I saw poor results for perlbench with my
initial aarch64 hooks because the hooks reduced the cost to zero for
the entry case:
auto entry_cost = targetm.callee_save_cost
(spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
allocated_callee_save_regs, existing_spills_p);
/* In the event of a tie between caller-save and callee-save,
prefer callee-save. We apply this to the entry cost rather
than the exit cost since the entry frequency must be at
least as high as the exit frequency. */
if (entry_cost > 0)
entry_cost -= 1;
I "fixed" that by bumping the cost to a minimum of 2, but I was
wondering whether the "entry_cost > 0" should instead be "entry_cost > 1",
so that the cost is always greater than not using a callee save for
registers that don't cross a call. WDYT?
Richard
>
> Thanks for running these. I saw poor results for perlbench with my
> initial aarch64 hooks because the hooks reduced the cost to zero for
> the entry case:
>
> auto entry_cost = targetm.callee_save_cost
> (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
> ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
> allocated_callee_save_regs, existing_spills_p);
> /* In the event of a tie between caller-save and callee-save,
> prefer callee-save. We apply this to the entry cost rather
> than the exit cost since the entry frequency must be at
> least as high as the exit frequency. */
> if (entry_cost > 0)
> entry_cost -= 1;
>
> I "fixed" that by bumping the cost to a minimum of 2, but I was
> wondering whether the "entry_cost > 0" should instead be "entry_cost > 1",
> so that the cost is always greater than not using a callee save for
> registers that don't cross a call. WDYT?
For x86 perfomance costs, the push cost should be memory_move_cost which
is 6, -2 for adjustment in the target hook and -1 for this. So cost
should not be 0 I think.
For size cost, I currently return 1, so we indeed get 0 after
adjustment.
I think cost of 0 will make us to pick callee save even if caller save
is available and there are no function calls, so I guess we do not want
that....
Honza
Jan Hubicka <hubicka@ucw.cz> writes:
>>
>> Thanks for running these. I saw poor results for perlbench with my
>> initial aarch64 hooks because the hooks reduced the cost to zero for
>> the entry case:
>>
>> auto entry_cost = targetm.callee_save_cost
>> (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
>> ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
>> allocated_callee_save_regs, existing_spills_p);
>> /* In the event of a tie between caller-save and callee-save,
>> prefer callee-save. We apply this to the entry cost rather
>> than the exit cost since the entry frequency must be at
>> least as high as the exit frequency. */
>> if (entry_cost > 0)
>> entry_cost -= 1;
>>
>> I "fixed" that by bumping the cost to a minimum of 2, but I was
>> wondering whether the "entry_cost > 0" should instead be "entry_cost > 1",
>> so that the cost is always greater than not using a callee save for
>> registers that don't cross a call. WDYT?
>
> For x86 perfomance costs, the push cost should be memory_move_cost which
> is 6, -2 for adjustment in the target hook and -1 for this. So cost
> should not be 0 I think.
>
> For size cost, I currently return 1, so we indeed get 0 after
> adjustment.
>
> I think cost of 0 will make us to pick callee save even if caller save
> is available and there are no function calls, so I guess we do not want
> that....
OK, here's an updated patch that makes that change. The x86 parts
should be replaced by your patch.
Tested on aarch64-linux-gnu. I also tried to test on pwoerpc64el-linux-gnu
(on gcc112), but I keep getting broken pipes during the test runs,
so I'm struggling to get good before/after comparisons. It does at
least bootstrap though...
I'm going to be away next week. If there's agreement, please feel free
to combine the patch with yours and push it while I'm away.
Thanks,
Richard
@@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
return false;
}
+/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
+
+static int
+ix86_ira_callee_saved_register_cost_scale (int)
+{
+ return 1;
+}
+
/* Return true if a set of DST by the expression SRC should be allowed.
This prevents complex sets of likely_spilled hard regs before split1. */
@@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
+#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
+#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
+ ix86_ira_callee_saved_register_cost_scale
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
@@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
The default version of this target hook always returns given class.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
+A target hook which returns the callee-saved register @var{hard_regno}
+cost scale in epilogue and prologue used by IRA.
+
+The default version of this target hook returns 1 if optimizing for
+size, otherwise returns the entry block frequency.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_LRA_P (void)
A target hook which returns true if we use LRA instead of reload pass.
@@ -2388,6 +2388,8 @@ in the reload pass.
@hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
+@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
+
@hook TARGET_LRA_P
@hook TARGET_REGISTER_PRIORITY
@@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
+ ira_memory_move_cost[mode][rclass][1])
* saved_nregs / hard_regno_nregs (hard_regno,
mode) - 1)
- * (optimize_size ? 1 :
- REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+ * targetm.ira_callee_saved_register_cost_scale (hard_regno);
cost += add_cost;
full_cost += add_cost;
}
@@ -5714,6 +5714,18 @@ DEFHOOK
reg_class_t, (int, reg_class_t, reg_class_t),
default_ira_change_pseudo_allocno_class)
+/* Scale of callee-saved register cost in epilogue and prologue used by
+ IRA. */
+DEFHOOK
+(ira_callee_saved_register_cost_scale,
+ "A target hook which returns the callee-saved register @var{hard_regno}\n\
+cost scale in epilogue and prologue used by IRA.\n\
+\n\
+The default version of this target hook returns 1 if optimizing for\n\
+size, otherwise returns the entry block frequency.",
+ int, (int hard_regno),
+ default_ira_callee_saved_register_cost_scale)
+
/* Return true if we use LRA instead of reload. */
DEFHOOK
(lra_p,
@@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
return cl;
}
+int
+default_ira_callee_saved_register_cost_scale (int)
+{
+ return (optimize_size
+ ? 1
+ : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+}
+
extern bool
default_lra_p (void)
{
@@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
reg_class_t);
+extern int default_ira_callee_saved_register_cost_scale (int);
extern bool default_lra_p (void);
extern int default_register_priority (int);
extern bool default_register_usage_leveling_p (void);