[v2] ira: Add a target hook for callee-saved register cost scale
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-arm |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
scales the cost of saving/restoring a callee-save hard register in epilogue
and prologue with the entry block frequency, which, if not optimizing for
size, is 10000, for all targets. As the result, callee-saved registers
may not be used to preserve local variable values across calls on some
targets, like x86. Add a target hook for the callee-saved register cost
scale in epilogue and prologue used by IRA. The default version of this
target hook returns 1 if optimizing for size, otherwise returns the entry
block frequency. Add an x86 version of this target hook to restore the
old behavior prior to the above commit.
PR rtl-optimization/111673
PR rtl-optimization/115932
PR rtl-optimization/116028
PR rtl-optimization/117081
PR rtl-optimization/117082
PR rtl-optimization/118497
* ira-color.cc (assign_hard_reg): Call the target hook for the
callee-saved register cost scale in epilogue and prologue.
* target.def (ira_callee_saved_register_cost_scale): New target
hook.
* targhooks.cc (default_ira_callee_saved_register_cost_scale):
New.
* targhooks.h (default_ira_callee_saved_register_cost_scale):
Likewise.
* config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
New.
(TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
* doc/tm.texi: Regenerated.
* doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
New.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
gcc/config/i386/i386.cc | 11 +++++++++++
gcc/doc/tm.texi | 8 ++++++++
gcc/doc/tm.texi.in | 2 ++
gcc/ira-color.cc | 3 +--
gcc/target.def | 12 ++++++++++++
gcc/targhooks.cc | 8 ++++++++
gcc/targhooks.h | 1 +
7 files changed, 43 insertions(+), 2 deletions(-)
Comments
On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
>
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
Why use optimize_size when REG_FREQ_FROM_BB uses
((optimize_function_for_size_p (cfun) \
|| !cfun->cfg->count_max.initialized_p ())
? You are changing behavior for all targets this way? Did you consider
altering REG_FREQ_FROM_BB to
#define REG_FREQ_FROM_BB(bb, size_value)
((optimize_function_for_size_p (cfun) \
|| !cfun->cfg->count_max.initialized_p ()) \
? default \
: ((bb)->count.to_frequency (cfun) \
* REG_FREQ_MAX / BB_FREQ_MAX) \
? ((bb)->count.to_frequency (cfun) \
* REG_FREQ_MAX / BB_FREQ_MAX) \
: 1)
so return a specified value for the optimize_size case? (see the
other thread where
I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
IMO at this point a new target hook should preserve existing behavior by default
or alternatively the original patch should be reverted as causing
regressions and
a new patch introducing the target hook should be installed in next stage1.
Richard.
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
> --
> 2.48.1
>
On Mon, Feb 3, 2025 at 5:27 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets. As the result, callee-saved registers
> > may not be used to preserve local variable values across calls on some
> > targets, like x86. Add a target hook for the callee-saved register cost
> > scale in epilogue and prologue used by IRA. The default version of this
> > target hook returns 1 if optimizing for size, otherwise returns the entry
> > block frequency. Add an x86 version of this target hook to restore the
> > old behavior prior to the above commit.
> >
> > PR rtl-optimization/111673
> > PR rtl-optimization/115932
> > PR rtl-optimization/116028
> > PR rtl-optimization/117081
> > PR rtl-optimization/117082
> > PR rtl-optimization/118497
> > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > callee-saved register cost scale in epilogue and prologue.
> > * target.def (ira_callee_saved_register_cost_scale): New target
> > hook.
> > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > New.
> > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > Likewise.
> > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > New.
> > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > * doc/tm.texi: Regenerated.
> > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > New.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > gcc/config/i386/i386.cc | 11 +++++++++++
> > gcc/doc/tm.texi | 8 ++++++++
> > gcc/doc/tm.texi.in | 2 ++
> > gcc/ira-color.cc | 3 +--
> > gcc/target.def | 12 ++++++++++++
> > gcc/targhooks.cc | 8 ++++++++
> > gcc/targhooks.h | 1 +
> > 7 files changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index f89201684a8..3128973ba79 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > return false;
> > }
> >
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > /* Return true if a set of DST by the expression SRC should be allowed.
> > This prevents complex sets of likely_spilled hard regs before split1. */
> >
> > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > + ix86_ira_callee_saved_register_cost_scale
> >
> > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 0de24eda6f0..9f42913a4ef 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > The default version of this target hook always returns given class.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > +A target hook which returns the callee-saved register @var{hard_regno}
> > +cost scale in epilogue and prologue used by IRA.
> > +
> > +The default version of this target hook returns 1 if optimizing for
> > +size, otherwise returns the entry block frequency.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > A target hook which returns true if we use LRA instead of reload pass.
> >
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index 631d04131e3..6dbe22581ca 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -2388,6 +2388,8 @@ in the reload pass.
> >
> > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >
> > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +
> > @hook TARGET_LRA_P
> >
> > @hook TARGET_REGISTER_PRIORITY
> > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > index 0699b349a1a..233060e1587 100644
> > --- a/gcc/ira-color.cc
> > +++ b/gcc/ira-color.cc
> > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > + ira_memory_move_cost[mode][rclass][1])
> > * saved_nregs / hard_regno_nregs (hard_regno,
> > mode) - 1)
> > - * (optimize_size ? 1 :
> > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > cost += add_cost;
> > full_cost += add_cost;
> > }
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 4050b2ebdd4..c348b15815a 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -5714,6 +5714,18 @@ DEFHOOK
> > reg_class_t, (int, reg_class_t, reg_class_t),
> > default_ira_change_pseudo_allocno_class)
> >
> > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > + IRA. */
> > +DEFHOOK
> > +(ira_callee_saved_register_cost_scale,
> > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > +cost scale in epilogue and prologue used by IRA.\n\
> > +\n\
> > +The default version of this target hook returns 1 if optimizing for\n\
> > +size, otherwise returns the entry block frequency.",
> > + int, (int hard_regno),
> > + default_ira_callee_saved_register_cost_scale)
> > +
> > /* Return true if we use LRA instead of reload. */
> > DEFHOOK
> > (lra_p,
> > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > index f80dc8b2e7e..344075efa41 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > return cl;
> > }
> >
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
>
> Why use optimize_size when REG_FREQ_FROM_BB uses
My patch just copies what
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
does.
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ())
>
> ? You are changing behavior for all targets this way? Did you consider
> altering REG_FREQ_FROM_BB to
>
> #define REG_FREQ_FROM_BB(bb, size_value)
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ()) \
> ? default \
> : ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> ? ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> : 1)
>
> so return a specified value for the optimize_size case? (see the
> other thread where
> I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
>
> IMO at this point a new target hook should preserve existing behavior by default
> or alternatively the original patch should be reverted as causing
> regressions and
> a new patch introducing the target hook should be installed in next stage1.
I believe the original patch should be reverted. Then my patch isn't needed.
>
> Richard.
>
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
> > extern bool
> > default_lra_p (void)
> > {
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > index 7d15f632c23..8871e01430c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > reg_class_t);
> > +extern int default_ira_callee_saved_register_cost_scale (int);
> > extern bool default_lra_p (void);
> > extern int default_register_priority (int);
> > extern bool default_register_usage_leveling_p (void);
> > --
> > 2.48.1
> >
On Mon, Feb 3, 2025 at 10:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 3, 2025 at 5:27 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > > Date: Tue Jun 25 08:37:49 2024 -0500
> > >
> > > ira: Scale save/restore costs of callee save registers with block frequency
> > >
> > > scales the cost of saving/restoring a callee-save hard register in epilogue
> > > and prologue with the entry block frequency, which, if not optimizing for
> > > size, is 10000, for all targets. As the result, callee-saved registers
> > > may not be used to preserve local variable values across calls on some
> > > targets, like x86. Add a target hook for the callee-saved register cost
> > > scale in epilogue and prologue used by IRA. The default version of this
> > > target hook returns 1 if optimizing for size, otherwise returns the entry
> > > block frequency. Add an x86 version of this target hook to restore the
> > > old behavior prior to the above commit.
> > >
> > > PR rtl-optimization/111673
> > > PR rtl-optimization/115932
> > > PR rtl-optimization/116028
> > > PR rtl-optimization/117081
> > > PR rtl-optimization/117082
> > > PR rtl-optimization/118497
> > > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > > callee-saved register cost scale in epilogue and prologue.
> > > * target.def (ira_callee_saved_register_cost_scale): New target
> > > hook.
> > > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > > New.
> > > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > > Likewise.
> > > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > > New.
> > > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > > * doc/tm.texi: Regenerated.
> > > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > > New.
> > >
> > > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > > gcc/config/i386/i386.cc | 11 +++++++++++
> > > gcc/doc/tm.texi | 8 ++++++++
> > > gcc/doc/tm.texi.in | 2 ++
> > > gcc/ira-color.cc | 3 +--
> > > gcc/target.def | 12 ++++++++++++
> > > gcc/targhooks.cc | 8 ++++++++
> > > gcc/targhooks.h | 1 +
> > > 7 files changed, 43 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index f89201684a8..3128973ba79 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > > return false;
> > > }
> > >
> > > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > > +
> > > +static int
> > > +ix86_ira_callee_saved_register_cost_scale (int)
> > > +{
> > > + return 1;
> > > +}
> > > +
> > > /* Return true if a set of DST by the expression SRC should be allowed.
> > > This prevents complex sets of likely_spilled hard regs before split1. */
> > >
> > > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > > + ix86_ira_callee_saved_register_cost_scale
> > >
> > > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > > index 0de24eda6f0..9f42913a4ef 100644
> > > --- a/gcc/doc/tm.texi
> > > +++ b/gcc/doc/tm.texi
> > > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > > The default version of this target hook always returns given class.
> > > @end deftypefn
> > >
> > > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > > +A target hook which returns the callee-saved register @var{hard_regno}
> > > +cost scale in epilogue and prologue used by IRA.
> > > +
> > > +The default version of this target hook returns 1 if optimizing for
> > > +size, otherwise returns the entry block frequency.
> > > +@end deftypefn
> > > +
> > > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > > A target hook which returns true if we use LRA instead of reload pass.
> > >
> > > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > > index 631d04131e3..6dbe22581ca 100644
> > > --- a/gcc/doc/tm.texi.in
> > > +++ b/gcc/doc/tm.texi.in
> > > @@ -2388,6 +2388,8 @@ in the reload pass.
> > >
> > > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > >
> > > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > > +
> > > @hook TARGET_LRA_P
> > >
> > > @hook TARGET_REGISTER_PRIORITY
> > > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > > index 0699b349a1a..233060e1587 100644
> > > --- a/gcc/ira-color.cc
> > > +++ b/gcc/ira-color.cc
> > > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > > + ira_memory_move_cost[mode][rclass][1])
> > > * saved_nregs / hard_regno_nregs (hard_regno,
> > > mode) - 1)
> > > - * (optimize_size ? 1 :
> > > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > > cost += add_cost;
> > > full_cost += add_cost;
> > > }
> > > diff --git a/gcc/target.def b/gcc/target.def
> > > index 4050b2ebdd4..c348b15815a 100644
> > > --- a/gcc/target.def
> > > +++ b/gcc/target.def
> > > @@ -5714,6 +5714,18 @@ DEFHOOK
> > > reg_class_t, (int, reg_class_t, reg_class_t),
> > > default_ira_change_pseudo_allocno_class)
> > >
> > > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > > + IRA. */
> > > +DEFHOOK
> > > +(ira_callee_saved_register_cost_scale,
> > > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > > +cost scale in epilogue and prologue used by IRA.\n\
> > > +\n\
> > > +The default version of this target hook returns 1 if optimizing for\n\
> > > +size, otherwise returns the entry block frequency.",
> > > + int, (int hard_regno),
> > > + default_ira_callee_saved_register_cost_scale)
> > > +
> > > /* Return true if we use LRA instead of reload. */
> > > DEFHOOK
> > > (lra_p,
> > > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > > index f80dc8b2e7e..344075efa41 100644
> > > --- a/gcc/targhooks.cc
> > > +++ b/gcc/targhooks.cc
> > > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > > return cl;
> > > }
> > >
> > > +int
> > > +default_ira_callee_saved_register_cost_scale (int)
> > > +{
> > > + return (optimize_size
> > > + ? 1
> >
> > Why use optimize_size when REG_FREQ_FROM_BB uses
>
> My patch just copies what
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> does.
>
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ())
> >
> > ? You are changing behavior for all targets this way? Did you consider
> > altering REG_FREQ_FROM_BB to
> >
> > #define REG_FREQ_FROM_BB(bb, size_value)
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ()) \
> > ? default \
> > : ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > ? ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > : 1)
> >
> > so return a specified value for the optimize_size case? (see the
> > other thread where
> > I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
> >
> > IMO at this point a new target hook should preserve existing behavior by default
> > or alternatively the original patch should be reverted as causing
> > regressions and
> > a new patch introducing the target hook should be installed in next stage1.
>
> I believe the original patch should be reverted. Then my patch isn't needed.
I'm OK with that, but it's not my call. I do wonder why the contributor did not
address any of the fallout. Maybe he's gone? Peter?
Thanks,
Richard.
>
> >
> > Richard.
> >
> > > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > > +}
> > > +
> > > extern bool
> > > default_lra_p (void)
> > > {
> > > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > > index 7d15f632c23..8871e01430c 100644
> > > --- a/gcc/targhooks.h
> > > +++ b/gcc/targhooks.h
> > > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > > reg_class_t);
> > > +extern int default_ira_callee_saved_register_cost_scale (int);
> > > extern bool default_lra_p (void);
> > > extern int default_register_priority (int);
> > > extern bool default_register_usage_leveling_p (void);
> > > --
> > > 2.48.1
> > >
>
>
>
> --
> H.J.
Richard Biener <richard.guenther@gmail.com> writes:
> On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
>> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
>> Date: Tue Jun 25 08:37:49 2024 -0500
>>
>> ira: Scale save/restore costs of callee save registers with block frequency
>>
>> scales the cost of saving/restoring a callee-save hard register in epilogue
>> and prologue with the entry block frequency, which, if not optimizing for
>> size, is 10000, for all targets. As the result, callee-saved registers
>> may not be used to preserve local variable values across calls on some
>> targets, like x86. Add a target hook for the callee-saved register cost
>> scale in epilogue and prologue used by IRA. The default version of this
>> target hook returns 1 if optimizing for size, otherwise returns the entry
>> block frequency. Add an x86 version of this target hook to restore the
>> old behavior prior to the above commit.
>>
>> PR rtl-optimization/111673
>> PR rtl-optimization/115932
>> PR rtl-optimization/116028
>> PR rtl-optimization/117081
>> PR rtl-optimization/117082
>> PR rtl-optimization/118497
>> * ira-color.cc (assign_hard_reg): Call the target hook for the
>> callee-saved register cost scale in epilogue and prologue.
>> * target.def (ira_callee_saved_register_cost_scale): New target
>> hook.
>> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
>> New.
>> * targhooks.h (default_ira_callee_saved_register_cost_scale):
>> Likewise.
>> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
>> New.
>> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
>> * doc/tm.texi: Regenerated.
>> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
>> New.
>>
>> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
>> ---
>> gcc/config/i386/i386.cc | 11 +++++++++++
>> gcc/doc/tm.texi | 8 ++++++++
>> gcc/doc/tm.texi.in | 2 ++
>> gcc/ira-color.cc | 3 +--
>> gcc/target.def | 12 ++++++++++++
>> gcc/targhooks.cc | 8 ++++++++
>> gcc/targhooks.h | 1 +
>> 7 files changed, 43 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
>> index f89201684a8..3128973ba79 100644
>> --- a/gcc/config/i386/i386.cc
>> +++ b/gcc/config/i386/i386.cc
>> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
>> return false;
>> }
>>
>> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
>> +
>> +static int
>> +ix86_ira_callee_saved_register_cost_scale (int)
>> +{
>> + return 1;
>> +}
>> +
>> /* Return true if a set of DST by the expression SRC should be allowed.
>> This prevents complex sets of likely_spilled hard regs before split1. */
>>
>> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
>> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
>> #undef TARGET_CLASS_LIKELY_SPILLED_P
>> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
>> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
>> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
>> + ix86_ira_callee_saved_register_cost_scale
>>
>> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
>> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
>> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
>> index 0de24eda6f0..9f42913a4ef 100644
>> --- a/gcc/doc/tm.texi
>> +++ b/gcc/doc/tm.texi
>> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
>> The default version of this target hook always returns given class.
>> @end deftypefn
>>
>> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
>> +A target hook which returns the callee-saved register @var{hard_regno}
>> +cost scale in epilogue and prologue used by IRA.
>> +
>> +The default version of this target hook returns 1 if optimizing for
>> +size, otherwise returns the entry block frequency.
>> +@end deftypefn
>> +
>> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
>> A target hook which returns true if we use LRA instead of reload pass.
>>
>> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
>> index 631d04131e3..6dbe22581ca 100644
>> --- a/gcc/doc/tm.texi.in
>> +++ b/gcc/doc/tm.texi.in
>> @@ -2388,6 +2388,8 @@ in the reload pass.
>>
>> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>>
>> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
>> +
>> @hook TARGET_LRA_P
>>
>> @hook TARGET_REGISTER_PRIORITY
>> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
>> index 0699b349a1a..233060e1587 100644
>> --- a/gcc/ira-color.cc
>> +++ b/gcc/ira-color.cc
>> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
>> + ira_memory_move_cost[mode][rclass][1])
>> * saved_nregs / hard_regno_nregs (hard_regno,
>> mode) - 1)
>> - * (optimize_size ? 1 :
>> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
>> cost += add_cost;
>> full_cost += add_cost;
>> }
>> diff --git a/gcc/target.def b/gcc/target.def
>> index 4050b2ebdd4..c348b15815a 100644
>> --- a/gcc/target.def
>> +++ b/gcc/target.def
>> @@ -5714,6 +5714,18 @@ DEFHOOK
>> reg_class_t, (int, reg_class_t, reg_class_t),
>> default_ira_change_pseudo_allocno_class)
>>
>> +/* Scale of callee-saved register cost in epilogue and prologue used by
>> + IRA. */
>> +DEFHOOK
>> +(ira_callee_saved_register_cost_scale,
>> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
>> +cost scale in epilogue and prologue used by IRA.\n\
>> +\n\
>> +The default version of this target hook returns 1 if optimizing for\n\
>> +size, otherwise returns the entry block frequency.",
>> + int, (int hard_regno),
>> + default_ira_callee_saved_register_cost_scale)
>> +
>> /* Return true if we use LRA instead of reload. */
>> DEFHOOK
>> (lra_p,
>> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
>> index f80dc8b2e7e..344075efa41 100644
>> --- a/gcc/targhooks.cc
>> +++ b/gcc/targhooks.cc
>> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
>> return cl;
>> }
>>
>> +int
>> +default_ira_callee_saved_register_cost_scale (int)
>> +{
>> + return (optimize_size
>> + ? 1
>
> Why use optimize_size when REG_FREQ_FROM_BB uses
>
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ())
>
> ? You are changing behavior for all targets this way? Did you consider
> altering REG_FREQ_FROM_BB to
>
> #define REG_FREQ_FROM_BB(bb, size_value)
> ((optimize_function_for_size_p (cfun) \
> || !cfun->cfg->count_max.initialized_p ()) \
> ? default \
> : ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> ? ((bb)->count.to_frequency (cfun) \
> * REG_FREQ_MAX / BB_FREQ_MAX) \
> : 1)
>
> so return a specified value for the optimize_size case? (see the
> other thread where
> I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
>
> IMO at this point a new target hook should preserve existing behavior by default
> or alternatively the original patch should be reverted as causing
> regressions and
> a new patch introducing the target hook should be installed in next stage1.
I don't think we should add a new target hook unless it's providing
genuinely new information about the target. Hooking into the RA to
brute-force a particular heuristic makes it harder to improve the RA
in future.
There are already hooks that provide the costs of the relevant operations,
so I think we should concentrate on using those to get good results for
both Power and x86.
Richard
On Mon, Feb 3, 2025 at 6:29 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Mon, Feb 3, 2025 at 7:23 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> >> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> >> Date: Tue Jun 25 08:37:49 2024 -0500
> >>
> >> ira: Scale save/restore costs of callee save registers with block frequency
> >>
> >> scales the cost of saving/restoring a callee-save hard register in epilogue
> >> and prologue with the entry block frequency, which, if not optimizing for
> >> size, is 10000, for all targets. As the result, callee-saved registers
> >> may not be used to preserve local variable values across calls on some
> >> targets, like x86. Add a target hook for the callee-saved register cost
> >> scale in epilogue and prologue used by IRA. The default version of this
> >> target hook returns 1 if optimizing for size, otherwise returns the entry
> >> block frequency. Add an x86 version of this target hook to restore the
> >> old behavior prior to the above commit.
> >>
> >> PR rtl-optimization/111673
> >> PR rtl-optimization/115932
> >> PR rtl-optimization/116028
> >> PR rtl-optimization/117081
> >> PR rtl-optimization/117082
> >> PR rtl-optimization/118497
> >> * ira-color.cc (assign_hard_reg): Call the target hook for the
> >> callee-saved register cost scale in epilogue and prologue.
> >> * target.def (ira_callee_saved_register_cost_scale): New target
> >> hook.
> >> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> >> New.
> >> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> >> Likewise.
> >> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> >> New.
> >> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> >> * doc/tm.texi: Regenerated.
> >> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> >> New.
> >>
> >> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> >> ---
> >> gcc/config/i386/i386.cc | 11 +++++++++++
> >> gcc/doc/tm.texi | 8 ++++++++
> >> gcc/doc/tm.texi.in | 2 ++
> >> gcc/ira-color.cc | 3 +--
> >> gcc/target.def | 12 ++++++++++++
> >> gcc/targhooks.cc | 8 ++++++++
> >> gcc/targhooks.h | 1 +
> >> 7 files changed, 43 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> >> index f89201684a8..3128973ba79 100644
> >> --- a/gcc/config/i386/i386.cc
> >> +++ b/gcc/config/i386/i386.cc
> >> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> >> return false;
> >> }
> >>
> >> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> >> +
> >> +static int
> >> +ix86_ira_callee_saved_register_cost_scale (int)
> >> +{
> >> + return 1;
> >> +}
> >> +
> >> /* Return true if a set of DST by the expression SRC should be allowed.
> >> This prevents complex sets of likely_spilled hard regs before split1. */
> >>
> >> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> >> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> >> #undef TARGET_CLASS_LIKELY_SPILLED_P
> >> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> >> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> >> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> >> + ix86_ira_callee_saved_register_cost_scale
> >>
> >> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> >> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> >> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> >> index 0de24eda6f0..9f42913a4ef 100644
> >> --- a/gcc/doc/tm.texi
> >> +++ b/gcc/doc/tm.texi
> >> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> >> The default version of this target hook always returns given class.
> >> @end deftypefn
> >>
> >> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> >> +A target hook which returns the callee-saved register @var{hard_regno}
> >> +cost scale in epilogue and prologue used by IRA.
> >> +
> >> +The default version of this target hook returns 1 if optimizing for
> >> +size, otherwise returns the entry block frequency.
> >> +@end deftypefn
> >> +
> >> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> >> A target hook which returns true if we use LRA instead of reload pass.
> >>
> >> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> >> index 631d04131e3..6dbe22581ca 100644
> >> --- a/gcc/doc/tm.texi.in
> >> +++ b/gcc/doc/tm.texi.in
> >> @@ -2388,6 +2388,8 @@ in the reload pass.
> >>
> >> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >>
> >> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> >> +
> >> @hook TARGET_LRA_P
> >>
> >> @hook TARGET_REGISTER_PRIORITY
> >> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> >> index 0699b349a1a..233060e1587 100644
> >> --- a/gcc/ira-color.cc
> >> +++ b/gcc/ira-color.cc
> >> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> >> + ira_memory_move_cost[mode][rclass][1])
> >> * saved_nregs / hard_regno_nregs (hard_regno,
> >> mode) - 1)
> >> - * (optimize_size ? 1 :
> >> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> >> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> >> cost += add_cost;
> >> full_cost += add_cost;
> >> }
> >> diff --git a/gcc/target.def b/gcc/target.def
> >> index 4050b2ebdd4..c348b15815a 100644
> >> --- a/gcc/target.def
> >> +++ b/gcc/target.def
> >> @@ -5714,6 +5714,18 @@ DEFHOOK
> >> reg_class_t, (int, reg_class_t, reg_class_t),
> >> default_ira_change_pseudo_allocno_class)
> >>
> >> +/* Scale of callee-saved register cost in epilogue and prologue used by
> >> + IRA. */
> >> +DEFHOOK
> >> +(ira_callee_saved_register_cost_scale,
> >> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> >> +cost scale in epilogue and prologue used by IRA.\n\
> >> +\n\
> >> +The default version of this target hook returns 1 if optimizing for\n\
> >> +size, otherwise returns the entry block frequency.",
> >> + int, (int hard_regno),
> >> + default_ira_callee_saved_register_cost_scale)
> >> +
> >> /* Return true if we use LRA instead of reload. */
> >> DEFHOOK
> >> (lra_p,
> >> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> >> index f80dc8b2e7e..344075efa41 100644
> >> --- a/gcc/targhooks.cc
> >> +++ b/gcc/targhooks.cc
> >> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> >> return cl;
> >> }
> >>
> >> +int
> >> +default_ira_callee_saved_register_cost_scale (int)
> >> +{
> >> + return (optimize_size
> >> + ? 1
> >
> > Why use optimize_size when REG_FREQ_FROM_BB uses
> >
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ())
> >
> > ? You are changing behavior for all targets this way? Did you consider
> > altering REG_FREQ_FROM_BB to
> >
> > #define REG_FREQ_FROM_BB(bb, size_value)
> > ((optimize_function_for_size_p (cfun) \
> > || !cfun->cfg->count_max.initialized_p ()) \
> > ? default \
> > : ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > ? ((bb)->count.to_frequency (cfun) \
> > * REG_FREQ_MAX / BB_FREQ_MAX) \
> > : 1)
> >
> > so return a specified value for the optimize_size case? (see the
> > other thread where
> > I wonder about that odd !cfun->cfg->count_max.initialized_p () check)
> >
> > IMO at this point a new target hook should preserve existing behavior by default
> > or alternatively the original patch should be reverted as causing
> > regressions and
> > a new patch introducing the target hook should be installed in next stage1.
>
> I don't think we should add a new target hook unless it's providing
> genuinely new information about the target. Hooking into the RA to
> brute-force a particular heuristic makes it harder to improve the RA
> in future.
>
> There are already hooks that provide the costs of the relevant operations,
> so I think we should concentrate on using those to get good results for
> both Power and x86.
It isn't just about Power and x86.
commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
Date: Tue Jun 25 08:37:49 2024 -0500
ira: Scale save/restore costs of callee save registers with block frequency
caused regressions on many targets, including aarch64:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116028
I don't understand why frequency can be used to scale the
cost.
On 2/3/25 2:31 AM, H.J. Lu wrote:
>>
>> IMO at this point a new target hook should preserve existing behavior by default
>> or alternatively the original patch should be reverted as causing
>> regressions and
>> a new patch introducing the target hook should be installed in next stage1.
>
> I believe the original patch should be reverted. Then my patch isn't needed.
That patch had significant improvements across the board for RISC-V. I
wouldn't want to see it reverted without a strong explanation of why it
was wrong.
jeff
> > I don't think we should add a new target hook unless it's providing
> > genuinely new information about the target. Hooking into the RA to
> > brute-force a particular heuristic makes it harder to improve the RA
> > in future.
> >
> > There are already hooks that provide the costs of the relevant operations,
> > so I think we should concentrate on using those to get good results for
> > both Power and x86.
>
> It isn't just about Power and x86.
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> caused regressions on many targets, including aarch64:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116028
>
> I don't understand why frequency can be used to scale the
> cost.
Adding such absolute hook does not seem to make sense to me, expecially
if the problem happens on multiple targets.
Let me see if I understnad what is going on. Checking one PR the patch
broke testcase:
void foo (void);
void bar (void);
int
test (int a)
{
int r;
if (r = -a)
foo ();
else
bar ();
return r;
}
Here the cost model should decide whether extra spill in
prologe/epilogue is chaper than extra spill around calls to foo() and
bar().
REG_FREQ_FROM_BB should return values in range 1....REG_FREQ_MAX which
expresses relative frequencies of BBs compressed to 0...REG_FREQ_MAX.
Avoiding 0 is to prevent cost model from thinking that spilling is
completely free.
In the case above we should have entry_block_freq = REG_FREQ_MAX (so the
epilogue/prologue cost is scaled by 1000) and frequencies of foo/bar to
have something like REG_FREQ_MAX * p and REG_FREQ_MAX * (1-p) where p is
the probability of the conditional.
So I guess IRA ends up comparing
- spill_cost * REG_FREQ_MAX for using callee saved register
- spill_cost * (p + 1-p) * REG_FREQ_MAX for using caller saved register.
Which correctly represnets that in both case we will end up executing
same amount of spill code.
If we want it to choose variant with fewer static spill count, we could
add 1 to the final costs for every spill instruction which will biass
the cost model that way...
What would make sense would be to express to IRA that moves used in
prologue/epilogue are slightly chaper on x86 then reuglar moves, since we can use
push/pop pair which encode shorter than stack frame adjustment + regular
store/load used to spill around call.
Honza
>
> --
> H.J.
Hello,
On Mon, 3 Feb 2025, H.J. Lu wrote:
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets.
This merely represents the fact that the entry block is indeed entered
exactly once per function invocation, i.e. 1.0 in fixed point with a scale
of 1000. All costs in ira are (supposed to be) scaled by bb-frequency of
the allocno/register occurence, and hence this add_cost to cater for
xlogue-save/restore needs to be scaled by that as well, which is what
Suryas patch was adding.
Any fallout from that needs to be addressed on top of that, not by
reverting it, or by introducing a hook to avoid that. Think of this scale
as an arbitrary value to implement pseudo-fixed-point arithmetic for
costs. All values need to be scaled by it. That its value is a seemingly
large number of 1000 is not the worry, it represents 1.0 .
If the issue is for instance that callee-saved registers aren't used
because the prologue save/restore is now deemed too expensive relative to
the around-call-save-restore when a call-clobbered register is used, then
either the around-call-save-restore instructions aren't correctly costed
(perhaps also missing the scale factor?), or because ties aren't broken
nicely, in which case adding a 1 at one or the other place might be
needed.
Ciao,
Michael.
On 2/3/25 3:44 AM, Richard Biener wrote:
> On Mon, Feb 3, 2025 at 10:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>> I believe the original patch should be reverted. Then my patch isn't needed.
>
> I'm OK with that, but it's not my call. I do wonder why the contributor did not
> address any of the fallout. Maybe he's gone? Peter?
Surya (she) is working on the fallout. In fact, one patch earlier this year
was committed and reverted due to some aarch64 fallout. That said, Andrew
mentioned on IRC that he was interested in getting that patch back in for aarch64
because it helps shrink-wrapping and he believes the patch itself wasn't bad,
but exposed a latent issue that was causing the bootstrap issue on aarch64.
Surya also just recently submitted another patch to help with the original fallout:
[PATCH] lra: initialize allocated_hard_reg_p[] for hard regs referenced in RTL [PR118533]
...which you commented on. She is working on them.
I disagree with H.J.'s comment. I have said before at the Cauldron and in
some bugzilla's, that Surya's fix is a correct fix. The issues encountered
here seem to be latent issues exposed by Surya's fix (read also Matz's reply)
and as such, this patch should stay. The correct path here is to track down
those latent issues and fix those. I've asked Surya to continue to work on
the fallout, but any help from other's is greatly appreciated!
Reverting now would also cause performance regressions on Power, RISC-V and ARM.
Peter
On 2/3/25 7:14 AM, Jeff Law wrote:
> On 2/3/25 2:31 AM, H.J. Lu wrote:
>> I believe the original patch should be reverted. Then my patch isn't needed.
>
> That patch had significant improvements across the board for RISC-V.
> I wouldn't want to see it reverted without a strong explanation of why it was wrong.
In my opinion, the patch is not wrong, but rather has exposed latent issues
that need to be worked on and fixed. Ive asked Surya to continue working on
the fallout (see her other patches), but help from others is always appreciated.
Peter
> Hello,
>
> On Mon, 3 Feb 2025, H.J. Lu wrote:
>
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets.
>
> This merely represents the fact that the entry block is indeed entered
> exactly once per function invocation, i.e. 1.0 in fixed point with a scale
> of 1000. All costs in ira are (supposed to be) scaled by bb-frequency of
> the allocno/register occurence, and hence this add_cost to cater for
> xlogue-save/restore needs to be scaled by that as well, which is what
> Suryas patch was adding.
This is nice summary. One correction is that REG_FREQ_MAX should
represent maximal frequency in function (i.e. innermost loop if it
exists). So entry bock has REG_FREQ_MAX only in functions with no BBs
that execute with frequency >1.
Freq 1 is frequency of entry block (which is correctly used by the
patch).
>
> Any fallout from that needs to be addressed on top of that, not by
> reverting it, or by introducing a hook to avoid that. Think of this scale
> as an arbitrary value to implement pseudo-fixed-point arithmetic for
> costs. All values need to be scaled by it. That its value is a seemingly
> large number of 1000 is not the worry, it represents 1.0 .
>
> If the issue is for instance that callee-saved registers aren't used
> because the prologue save/restore is now deemed too expensive relative to
> the around-call-save-restore when a call-clobbered register is used, then
> either the around-call-save-restore instructions aren't correctly costed
> (perhaps also missing the scale factor?), or because ties aren't broken
> nicely, in which case adding a 1 at one or the other place might be
> needed.
In the testcase I quoted from one of PRs the costs was simply
(dynamically) the same. there were two function calls with combine
frequency 1. We may want to add logic representing that static
instruction counts matters too, if the dynamic costs are the same which
indeed can be done by adding 1.
Other thing is that push/pop is shorter than mov which is target
specific knowledge.
Honza
>
>
> Ciao,
> Michael.
On 2/3/25 1:20 AM, H.J. Lu wrote:
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
This is a complicated problem resulted in many tries to fix it in some
general way.
In general I am agree with Surya's approach to scale cost of reg
saves/restores somehow. But the general approach, although solved some
problems, also created a lot of new ones. May be because IRA does not
take some other aspects of using callee saved regs. And some of them
were addressed by other patches. e.g. recently proposed by Surya and one
for PR118497.
I also agree with Richard Sandiford's comment that we should avoid
introducing the new hooks for RA and I actually tried to stick to this
policy for a long time. But I don't see another solution to introducing
the new hook in this case. It is hard to figure out generally in RA
that saves/restores will be insns different from ld/st (e.g. x86
push/pop) and that they will be cheaper.
So after some time to think about the patch I decided to approve the RA
part of the patch. I also hope that the work on this problem will
continue (e.g. improving default and target hook implementations and
documentation how to better use it).
We have a lot of tests checking expected code generation (some of them
are wrong, e.g. expecting particular hard regs). But those are small
tests. We should pay more attention to check impact on SPEC for such
patches. It is more time-consuming but it is the right way. A patch can
introduce worse code generation for small test but can be a win for real
applications as ones in SPEC. Any RA in industrial compiler can not
generate optimal code for most cases. So I would not mark such GCC test
failures as P1 PRs.
The RA patch is OK for me for committing it into the trunk. Thank you.
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
Vladimir Makarov <vmakarov@redhat.com> writes:
> On 2/3/25 1:20 AM, H.J. Lu wrote:
>> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
>> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
>> Date: Tue Jun 25 08:37:49 2024 -0500
>>
>> ira: Scale save/restore costs of callee save registers with block frequency
>>
>> scales the cost of saving/restoring a callee-save hard register in epilogue
>> and prologue with the entry block frequency, which, if not optimizing for
>> size, is 10000, for all targets. As the result, callee-saved registers
>> may not be used to preserve local variable values across calls on some
>> targets, like x86. Add a target hook for the callee-saved register cost
>> scale in epilogue and prologue used by IRA. The default version of this
>> target hook returns 1 if optimizing for size, otherwise returns the entry
>> block frequency. Add an x86 version of this target hook to restore the
>> old behavior prior to the above commit.
>
> This is a complicated problem resulted in many tries to fix it in some
> general way.
>
> In general I am agree with Surya's approach to scale cost of reg
> saves/restores somehow. But the general approach, although solved some
> problems, also created a lot of new ones. May be because IRA does not
> take some other aspects of using callee saved regs. And some of them
> were addressed by other patches. e.g. recently proposed by Surya and one
> for PR118497.
>
> I also agree with Richard Sandiford's comment that we should avoid
> introducing the new hooks for RA and I actually tried to stick to this
> policy for a long time. But I don't see another solution to introducing
> the new hook in this case. It is hard to figure out generally in RA
> that saves/restores will be insns different from ld/st (e.g. x86
> push/pop) and that they will be cheaper.
>
> So after some time to think about the patch I decided to approve the RA
> part of the patch. I also hope that the work on this problem will
> continue (e.g. improving default and target hook implementations and
> documentation how to better use it).
For the record, I strongly object to this. The hook just seems like a
complete hack to me. Even if we accept that there is target-specific
information in play, the hook isn't providing that information.
In contrast to what you said above, my objection isn't to having hooks
-- those are often needed and good. What bothers me is that the hook
isn't well designed. Hooks should provide information rather than
override code by brute force.
On the other hand, I accept that you're (rightly!) the maintainer.
Thanks,
Richard
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > return cl;
> > }
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
I am not sure how this makes sense - why x86 would be significantly
different from other targets?
I think the only bit non-standard thing is that prologue/epilogue code
can use push/pop that is shorter than mov used to save caller saved
registers.
I went through few testcases:
void d();
void a()
{
int b;
asm ("use %0":"=r" (b));
d();
asm volatile (""::"r" (b));
}
compiler with -O2 -fira-verbose=10000 gives:
Popping a0(r99,l0) -- (0=12000,12000) (1=12000,12000) (2=12000,12000) (4=12000,12000) (5=12000,12000) (36=12000,12000) (37=12000,12000) (38=12000,12000) (39=12000,12000) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
load and save costs are 6. So spill pair is 12 weighted by 1000 that is
REG_FREQ_MAX.
Register 0 (EAX) has cost 12000 which makes sense to me:
- load and save costs are 6, combined spill pair is 12
- REG_FREQ_MAX is 1000 and since function has only one BB, it has
maximal frequency, so we get 12000.
Register 3 (first caller saved) has cost 11000. This comes from:
add_cost = ((ira_memory_move_cost[mode][rclass][0]
+ ira_memory_move_cost[mode][rclass][1])
* saved_nregs / hard_regno_nregs (hard_regno,
mode) - 1)
^^
here
* (optimize_size ? 1 :
REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
There is no comment why -1, but I suppose it is there to biass costs to
use prologue/epilogue instad of caller save sequence when runtime cost
estimate is even.
Now for
void d();
void a()
{
for (int i = 0; i < 100; i++)
d();
int b;
asm ("use %0":"=r" (b));
d();
asm volatile (""::"r" (b));
}
I get
Popping a0(r100,l0) -- (0=120,120) (1=120,120) (2=120,120) (4=120,120) (5=120,120) (36=120,120) (37=120,120) (38=120,120) (39=120,120) (3=0,0) (6=110,110) (40=110,110) (41=110,110) (42=110,110) (43=110,110)
This also makes sense to me, since there is loop the basic block has
lower frequency of 10, thus costs are scaled down.
void d();
int cnd;
void a()
{
int b;
asm ("use %0":"=r" (b));
if (__builtin_expect_with_probability (cnd, 1, 0.8))
d();
asm volatile (""::"r" (b));
}
I get
Popping a0(r100,l0) -- (0=9600,9600) (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
which seems also correct. It is better to use caller saved registr
since call to d() has lower frequency then the entry basic block. This
is what gcc 13 and this patch gets wrong
Popping a0(r100,l0) -- (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11,11) (6=11,11) (40=11,11) (41=11,11) (42=11,11) (43=11,11)
Due to missing scaling factor we think that using callee saved registr
is win while it is not. GCC13 gets this wrong even for probability 0.
Looking into PRs referneced in the patch:
PR111673 is the original bug that motivated correcting the cost (adding
the scale by entry block frequency)
PR115932 is cris-elf I don't know how to bencmark easily.
PR116028 seems to be about shrink wrapping in
void f(int *i)
{
if (!i)
return;
else
{
__builtin_printf("Hi");
*i=0;
}
}
here I see tha tthe cost model misses the fact that epilogue will be
shrink-wrapped so both caller and callee saving will result in one spill
after the early exit.
PR117081 is about regression in povray. The reducted testcase:
void foo (void);
void bar (void);
int
test (int a)
{
int r;
if (r = -a)
foo ();
else
bar ();
return r;
}
shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
saved register and produces:
0000000000000000 <test>:
0: 53 push %rbx <--- callee save
1: 89 fb mov %edi,%ebx <--- move 1
3: f7 db neg %ebx
5: 74 09 je 10 <test+0x10>
7: e8 00 00 00 00 call c <test+0xc>
c: 89 d8 mov %ebx,%eax <--- callee restore
e: 5b pop %rbx
f: c3 ret
10: e8 00 00 00 00 call 15 <test+0x15>
15: 89 d8 mov %ebx,%eax <--- move 2
17: 5b pop %rbx <--- callee restore
18: c3 ret
Mainline used EAX since it has costs 13000. It is not 100% clear to me
why.
- 12000 is the spilling (which is emitted twice but executed just once)
- I would have expected 2000 for the move from edi to eax.
However even if cost is 14000 we will choose EAX. The code is:
0: 89 f8 mov %edi,%eax <--- move1
2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
6: f7 d8 neg %eax
8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
c: 85 ff test %edi,%edi
e: 74 10 je 20 <test+0x20>
10: e8 00 00 00 00 call 15 <test+0x15>
15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
1d: c3 ret
1e: 66 90 xchg %ax,%ax
20: e8 00 00 00 00 call 25 <test+0x25>
25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
2d: c3 ret
This sequence really saves one move at the expense of of stack frame
allocation (which is not modelled by the cost model) and longer spill
code (also no modelled).
PR117082 is about noreturn function:
__attribute__ ((noreturn))
void
f3 (void)
{
int y0 = x0;
int y1 = x1;
f1 ();
f2 (y0, y1);
while (1);
}
Here the cost model is really wrong by assuming that entry and exit
block have same frequencies. This can be fixed quite easilly (though it
is a rare case)
PR118497 seems to be ixed.
So overall I think
1) we can fix scaling of epilogue by exit block frequency
to get noreturns right.
2) we should drop the check for optimize_size. Since with -Os
REG_FREQ_FROM_BB always returns 1000 everything should be scaled
same way
3) we currently have wire in "-1" to biass the cost metric for callee
saved registers.
It may make sense to allow targets to control this, since i.e. x86
has push/pop that is shorter. -3 would solve the testcase with neg
and would express that push/pop is still cheaper with extra reg-reg
move.
4) cost model misses shring wrapping, the fact that if register is
callee saved it may be used by multiple allocnos and also that
push/pop sequence may avoid need for manual RSP adjustments.
Those seems bit harder things to fit in though.
So if we want to go with the target hook, I think it should adjust the
cost before scalling (since targets may have special tricks for
prologues) rather than the scale factor (which is target independent
part of cost model).
Honza
On 2/6/25 4:54 PM, Richard Sandiford wrote:
> Vladimir Makarov<vmakarov@redhat.com> writes:
>> This is a complicated problem resulted in many tries to fix it in some
>> general way.
>>
>> In general I am agree with Surya's approach to scale cost of reg
>> saves/restores somehow. But the general approach, although solved some
>> problems, also created a lot of new ones. May be because IRA does not
>> take some other aspects of using callee saved regs. And some of them
>> were addressed by other patches. e.g. recently proposed by Surya and one
>> for PR118497.
>>
>> I also agree with Richard Sandiford's comment that we should avoid
>> introducing the new hooks for RA and I actually tried to stick to this
>> policy for a long time. But I don't see another solution to introducing
>> the new hook in this case. It is hard to figure out generally in RA
>> that saves/restores will be insns different from ld/st (e.g. x86
>> push/pop) and that they will be cheaper.
>>
>> So after some time to think about the patch I decided to approve the RA
>> part of the patch. I also hope that the work on this problem will
>> continue (e.g. improving default and target hook implementations and
>> documentation how to better use it).
> For the record, I strongly object to this. The hook just seems like a
> complete hack to me. Even if we accept that there is target-specific
> information in play, the hook isn't providing that information.
>
> In contrast to what you said above, my objection isn't to having hooks
> -- those are often needed and good. What bothers me is that the hook
> isn't well designed. Hooks should provide information rather than
> override code by brute force.
>
> On the other hand, I accept that you're (rightly!) the maintainer.
>
I also don't like the hook implementation for x86-64 (although this is a
matter of target maintainers). All these costs look voodoo and random
to me.
But this problem is longing for more than half year. I spent a lot of
time too on this. Patches were submitted and reverted and nobody did
find so far any solution satisfying all GCC tests. If somebody finds a
solution without the hook, I will be glad to get rid off the hook. Also
the related PRs are marked as P1 ones, it means people think they are
important (I am not sure about this myself). Without fixing them (or
downgrading them) there will be no GCC release. So I am in difficult
situation with these PRs and need some resolution.
On Thu, Feb 6, 2025 at 11:40 PM Vladimir Makarov <vmakarov@redhat.com> wrote:
>
>
> On 2/6/25 4:54 PM, Richard Sandiford wrote:
>
> Vladimir Makarov <vmakarov@redhat.com> writes:
>
> This is a complicated problem resulted in many tries to fix it in some
> general way.
>
> In general I am agree with Surya's approach to scale cost of reg
> saves/restores somehow. But the general approach, although solved some
> problems, also created a lot of new ones. May be because IRA does not
> take some other aspects of using callee saved regs. And some of them
> were addressed by other patches. e.g. recently proposed by Surya and one
> for PR118497.
>
> I also agree with Richard Sandiford's comment that we should avoid
> introducing the new hooks for RA and I actually tried to stick to this
> policy for a long time. But I don't see another solution to introducing
> the new hook in this case. It is hard to figure out generally in RA
> that saves/restores will be insns different from ld/st (e.g. x86
> push/pop) and that they will be cheaper.
>
> So after some time to think about the patch I decided to approve the RA
> part of the patch. I also hope that the work on this problem will
> continue (e.g. improving default and target hook implementations and
> documentation how to better use it).
>
> For the record, I strongly object to this. The hook just seems like a
> complete hack to me. Even if we accept that there is target-specific
> information in play, the hook isn't providing that information.
>
> In contrast to what you said above, my objection isn't to having hooks
> -- those are often needed and good. What bothers me is that the hook
> isn't well designed. Hooks should provide information rather than
> override code by brute force.
>
> On the other hand, I accept that you're (rightly!) the maintainer.
>
> I also don't like the hook implementation for x86-64 (although this is a matter of target maintainers). All these costs look voodoo and random to me.
>
> But this problem is longing for more than half year. I spent a lot of time too on this. Patches were submitted and reverted and nobody did find so far any solution satisfying all GCC tests. If somebody finds a solution without the hook, I will be glad to get rid off the hook. Also the related PRs are marked as P1 ones, it means people think they are important (I am not sure about this myself). Without fixing them (or downgrading them) there will be no GCC release. So I am in difficult situation with these PRs and need some resolution.
Just to chime in as the one who likely made those PRs P1. 'P1' here
is really about the testsuite FAIL,
how to resolve it is up to target maintainers - P1 should make them
look, and adjusting the testcase
(or even XFAILing it) is a valid resolution of the P1 regression.
I do agree with Richards comment on the proposed hook to be badly
designed and I do like
Honzas suggestion to instead hookize the biasing.
Richard.
>
>
> On Thu, Feb 6, 2025 at 11:40 PM Vladimir Makarov <vmakarov@redhat.com> wrote:
> >
> >
> > On 2/6/25 4:54 PM, Richard Sandiford wrote:
> >
> > Vladimir Makarov <vmakarov@redhat.com> writes:
> >
> > This is a complicated problem resulted in many tries to fix it in some
> > general way.
> >
> > In general I am agree with Surya's approach to scale cost of reg
> > saves/restores somehow. But the general approach, although solved some
> > problems, also created a lot of new ones. May be because IRA does not
> > take some other aspects of using callee saved regs. And some of them
> > were addressed by other patches. e.g. recently proposed by Surya and one
> > for PR118497.
> >
> > I also agree with Richard Sandiford's comment that we should avoid
> > introducing the new hooks for RA and I actually tried to stick to this
> > policy for a long time. But I don't see another solution to introducing
> > the new hook in this case. It is hard to figure out generally in RA
> > that saves/restores will be insns different from ld/st (e.g. x86
> > push/pop) and that they will be cheaper.
> >
> > So after some time to think about the patch I decided to approve the RA
> > part of the patch. I also hope that the work on this problem will
> > continue (e.g. improving default and target hook implementations and
> > documentation how to better use it).
> >
> > For the record, I strongly object to this. The hook just seems like a
> > complete hack to me. Even if we accept that there is target-specific
> > information in play, the hook isn't providing that information.
> >
> > In contrast to what you said above, my objection isn't to having hooks
> > -- those are often needed and good. What bothers me is that the hook
> > isn't well designed. Hooks should provide information rather than
> > override code by brute force.
> >
> > On the other hand, I accept that you're (rightly!) the maintainer.
> >
> > I also don't like the hook implementation for x86-64 (although this is a matter of target maintainers). All these costs look voodoo and random to me.
> >
> > But this problem is longing for more than half year. I spent a lot of time too on this. Patches were submitted and reverted and nobody did find so far any solution satisfying all GCC tests. If somebody finds a solution without the hook, I will be glad to get rid off the hook. Also the related PRs are marked as P1 ones, it means people think they are important (I am not sure about this myself). Without fixing them (or downgrading them) there will be no GCC release. So I am in difficult situation with these PRs and need some resolution.
>
> Just to chime in as the one who likely made those PRs P1. 'P1' here
> is really about the testsuite FAIL,
> how to resolve it is up to target maintainers - P1 should make them
> look, and adjusting the testcase
> (or even XFAILing it) is a valid resolution of the P1 regression.
I certainly understand the pain with heuirstics changes distubring
testsuite and bringing some regressions and some improvements making it
difficult to weight overall impact. With inliner heuristics this
happens often.
We are good on tracking regressions, but if something improves, it goes
without saying. If some heuristics is formely wrong and fixing it
introduces some problems, we are kind of biassed towards keeping old,
broken one. Consistency is good, but I think in this specific case we
should see overall improvements.
The testcases where we previously shrink-wrapped and now use caller
saved registers are IMO not necessarily a problem. We had bug in cost
model that made us to use callee saved registers too often and
shrink-wrapping helped to mitigate it. Using caller saved register
has same effect if frequency of caller saving sums to 1. So perhaps
adjusting the testcases to have more calls will restore original
behaviour in cases where they are intendd to verify that shrink-wrapping
works.
Naturally it would be nice to make the cost model anticipate the
shrink-wrapping but that looks like quite hard problem to me, since
precise conditions are known only post-reload. Perhaps one can special
case some early exit conditions where shrink-wrapping is most important.
I will prepare patch for noreturns. They are probably not too important
in practice but easy to get right.
Honza
Really nice analysis! Thanks for writing this up.
Sorry for the big quote below, but:
Jan Hubicka <hubicka@ucw.cz> writes:
>> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
>> > +
>> > +static int
>> > +ix86_ira_callee_saved_register_cost_scale (int)
>> > +{
>> > + return 1;
>> > +}
>> > +
>
>> > return cl;
>> > }
>> > +int
>> > +default_ira_callee_saved_register_cost_scale (int)
>> > +{
>> > + return (optimize_size
>> > + ? 1
>> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>> > +}
>> > +
>
> I am not sure how this makes sense - why x86 would be significantly
> different from other targets?
>
> I think the only bit non-standard thing is that prologue/epilogue code
> can use push/pop that is shorter than mov used to save caller saved
> registers.
>
> I went through few testcases:
>
> void d();
> void a()
> {
> int b;
> asm ("use %0":"=r" (b));
> d();
> asm volatile (""::"r" (b));
> }
> compiler with -O2 -fira-verbose=10000 gives:
>
> Popping a0(r99,l0) -- (0=12000,12000) (1=12000,12000) (2=12000,12000) (4=12000,12000) (5=12000,12000) (36=12000,12000) (37=12000,12000) (38=12000,12000) (39=12000,12000) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
>
> load and save costs are 6. So spill pair is 12 weighted by 1000 that is
> REG_FREQ_MAX.
>
> Register 0 (EAX) has cost 12000 which makes sense to me:
> - load and save costs are 6, combined spill pair is 12
> - REG_FREQ_MAX is 1000 and since function has only one BB, it has
> maximal frequency, so we get 12000.
>
> Register 3 (first caller saved) has cost 11000. This comes from:
> add_cost = ((ira_memory_move_cost[mode][rclass][0]
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> ^^
> here
>
> * (optimize_size ? 1 :
> REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>
> There is no comment why -1, but I suppose it is there to biass costs to
> use prologue/epilogue instad of caller save sequence when runtime cost
> estimate is even.
>
> Now for
> void d();
> void a()
> {
> for (int i = 0; i < 100; i++)
> d();
> int b;
> asm ("use %0":"=r" (b));
> d();
> asm volatile (""::"r" (b));
> }
>
> I get
>
> Popping a0(r100,l0) -- (0=120,120) (1=120,120) (2=120,120) (4=120,120) (5=120,120) (36=120,120) (37=120,120) (38=120,120) (39=120,120) (3=0,0) (6=110,110) (40=110,110) (41=110,110) (42=110,110) (43=110,110)
>
> This also makes sense to me, since there is loop the basic block has
> lower frequency of 10, thus costs are scaled down.
>
> void d();
> int cnd;
> void a()
> {
> int b;
> asm ("use %0":"=r" (b));
>
> if (__builtin_expect_with_probability (cnd, 1, 0.8))
> d();
> asm volatile (""::"r" (b));
> }
>
> I get
>
> Popping a0(r100,l0) -- (0=9600,9600) (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11000,11000) (6=11000,11000) (40=11000,11000) (41=11000,11000) (42=11000,11000) (43=11000,11000)
>
> which seems also correct. It is better to use caller saved registr
> since call to d() has lower frequency then the entry basic block. This
> is what gcc 13 and this patch gets wrong
>
> Popping a0(r100,l0) -- (1=9600,9600) (2=9600,9600) (4=9600,9600) (5=9600,9600) (36=9600,9600) (37=9600,9600) (38=9600,9600) (39=9600,9600) (3=11,11) (6=11,11) (40=11,11) (41=11,11) (42=11,11) (43=11,11)
>
> Due to missing scaling factor we think that using callee saved registr
> is win while it is not. GCC13 gets this wrong even for probability 0.
>
> Looking into PRs referneced in the patch:
> PR111673 is the original bug that motivated correcting the cost (adding
> the scale by entry block frequency)
> PR115932 is cris-elf I don't know how to bencmark easily.
> PR116028 seems to be about shrink wrapping in
>
> void f(int *i)
> {
> if (!i)
> return;
> else
> {
> __builtin_printf("Hi");
> *i=0;
> }
> }
>
> here I see tha tthe cost model misses the fact that epilogue will be
> shrink-wrapped so both caller and callee saving will result in one spill
> after the early exit.
>
> PR117081 is about regression in povray. The reducted testcase:
>
> void foo (void);
> void bar (void);
>
> int
> test (int a)
> {
> int r;
>
> if (r = -a)
> foo ();
> else
> bar ();
>
> return r;
> }
>
> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
>
> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
> saved register and produces:
>
> 0000000000000000 <test>:
> 0: 53 push %rbx <--- callee save
> 1: 89 fb mov %edi,%ebx <--- move 1
> 3: f7 db neg %ebx
> 5: 74 09 je 10 <test+0x10>
> 7: e8 00 00 00 00 call c <test+0xc>
> c: 89 d8 mov %ebx,%eax <--- callee restore
> e: 5b pop %rbx
> f: c3 ret
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 89 d8 mov %ebx,%eax <--- move 2
> 17: 5b pop %rbx <--- callee restore
> 18: c3 ret
>
> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> why.
> - 12000 is the spilling (which is emitted twice but executed just once)
> - I would have expected 2000 for the move from edi to eax.
> However even if cost is 14000 we will choose EAX. The code is:
>
> 0: 89 f8 mov %edi,%eax <--- move1
> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> 6: f7 d8 neg %eax
> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> c: 85 ff test %edi,%edi
> e: 74 10 je 20 <test+0x20>
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> 1d: c3 ret
> 1e: 66 90 xchg %ax,%ax
> 20: e8 00 00 00 00 call 25 <test+0x25>
> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> 2d: c3 ret
>
> This sequence really saves one move at the expense of of stack frame
> allocation (which is not modelled by the cost model) and longer spill
> code (also no modelled).
Kind of a tangent, but is a cost of 2 reasonable for these reg<-reg moves,
or is it too high? Reducing moves seems like the wrong thing to optimise
for (when optimising for speed) if the moves disappear during renaming anyway.
> PR117082 is about noreturn function:
> __attribute__ ((noreturn))
> void
> f3 (void)
> {
> int y0 = x0;
> int y1 = x1;
> f1 ();
> f2 (y0, y1);
> while (1);
> }
>
> Here the cost model is really wrong by assuming that entry and exit
> block have same frequencies. This can be fixed quite easilly (though it
> is a rare case)
Yeah (and nice example).
> PR118497 seems to be ixed.
>
> So overall I think
> 1) we can fix scaling of epilogue by exit block frequency
> to get noreturns right.
> 2) we should drop the check for optimize_size. Since with -Os
> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> same way
> 3) we currently have wire in "-1" to biass the cost metric for callee
> saved registers.
> It may make sense to allow targets to control this, since i.e. x86
> has push/pop that is shorter. -3 would solve the testcase with neg
> and would express that push/pop is still cheaper with extra reg-reg
> move.
> 4) cost model misses shring wrapping, the fact that if register is
> callee saved it may be used by multiple allocnos and also that
> push/pop sequence may avoid need for manual RSP adjustments.
>
> Those seems bit harder things to fit in though.
>
> So if we want to go with the target hook, I think it should adjust the
> cost before scalling (since targets may have special tricks for
> prologues) rather than the scale factor (which is target independent
> part of cost model).
Like you say, one of the missing pieces appears to be the allocation/
dealloaction overhead for callee saves. Could we try to add a hook to
model that cost, based on certain parameters?
In particular, one thing that the examples above have in common is that
they don't need to allocate a frame for local variables. That seems
like it ought to be part of the mix. If we need to allocate a frame
using addition anyway, then presumably one of the advantages of push/pop
over callee saves goes away.
But going back to my question above, modelling the allocation and
deallocation would need to be done in a way that makes them more
expensive than moves. I think in some cases we've assumed that
addition and moves have equal cost, even when optimising for speed.
In other words, rather than add a hook to tweak the -1 bias (which I
still agree is a more reasonable thing to do than bypassing the IRA
code altogether), could we add a separate hook for the allocation/
deallocation and leave IRA to work out when to apply it?
I suppose for -fno-omit-frame-pointer we should also take the creation
of the frame link into account, if we don't already. This matters on
aarch64 since -fomit-frame-pointer is not the default at -O2.
One quirk on aarch64 is that, if we're using an odd number of
callee-saved GPRs, adding one more is essentially free, since we would
allocate space for it anyway, and could save and restore it alongside
the odd one out. That would probably be difficult to apply in practice
though. And it's obviously a separate issue from the current one.
Just mentioning it as another thing we could model in future.
Thanks,
Richard
Richard Sandiford <richard.sandiford@arm.com> writes:
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
Gah, of course I meant caller saves.
> >
> > 0: 89 f8 mov %edi,%eax <--- move1
> > 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> > 6: f7 d8 neg %eax
> > 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> > c: 85 ff test %edi,%edi
> > e: 74 10 je 20 <test+0x20>
> > 10: e8 00 00 00 00 call 15 <test+0x15>
> > 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> > 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> > 1d: c3 ret
> > 1e: 66 90 xchg %ax,%ax
> > 20: e8 00 00 00 00 call 25 <test+0x25>
> > 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> > 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> > 2d: c3 ret
> >
> > This sequence really saves one move at the expense of of stack frame
> > allocation (which is not modelled by the cost model) and longer spill
> > code (also no modelled).
>
> Kind of a tangent, but is a cost of 2 reasonable for these reg<-reg moves,
> or is it too high? Reducing moves seems like the wrong thing to optimise
> for (when optimising for speed) if the moves disappear during renaming anyway.
The costs are scaled so 2 is cost of usual reg-reg move. This
definitely predates me, but it is how reload worked. It used cost 2 for
move and 1 was used to add various biasses (such as ! in constraints).
It used scale of 3^depth to avoid spilling in deeper loop nests and I
have introduced the frequency metric instead.
Loat/store costs are thus relative to reg-reg move cost divided by two.
For generic we set cost of load/store to be 6, which is usual latency of
the operation (3 cycles). On modern x86 architectures this is all
relative, since reg-reg move has good chance to have latency of 1
and store+load pairs may execute faster, too.
>
> Like you say, one of the missing pieces appears to be the allocation/
> dealloaction overhead for callee saves. Could we try to add a hook to
> model that cost, based on certain parameters?
>
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
Even if you need to allocate frame, push/pop has advantage of short
encoding. But indeed it would be interesting to model the fact that
introducing frame has some cost.
On modern x86 the cost of stack adjustments is relatively low, since
they has stack engines.
>
> But going back to my question above, modelling the allocation and
> deallocation would need to be done in a way that makes them more
> expensive than moves. I think in some cases we've assumed that
> addition and moves have equal cost, even when optimising for speed.
Execution costs of stack adjustment are probalby not much bigger then
those of reg-reg move. so cost of 2 or 3 would make sense to me
>
> In other words, rather than add a hook to tweak the -1 bias (which I
> still agree is a more reasonable thing to do than bypassing the IRA
> code altogether), could we add a separate hook for the allocation/
> deallocation and leave IRA to work out when to apply it?
>
> I suppose for -fno-omit-frame-pointer we should also take the creation
> of the frame link into account, if we don't already. This matters on
> aarch64 since -fomit-frame-pointer is not the default at -O2.
>
> One quirk on aarch64 is that, if we're using an odd number of
> callee-saved GPRs, adding one more is essentially free, since we would
> allocate space for it anyway, and could save and restore it alongside
> the odd one out. That would probably be difficult to apply in practice
> though. And it's obviously a separate issue from the current one.
> Just mentioning it as another thing we could model in future.
Looks like fun, indeed.
Honza
>
> Thanks,
> Richard
Richard Sandiford <richard.sandiford@arm.com> writes:
> Really nice analysis! Thanks for writing this up.
>
> Sorry for the big quote below, but:
>
> Jan Hubicka <hubicka@ucw.cz> writes:
>> [...]
>> PR117081 is about regression in povray. The reducted testcase:
>>
>> void foo (void);
>> void bar (void);
>>
>> int
>> test (int a)
>> {
>> int r;
>>
>> if (r = -a)
>> foo ();
>> else
>> bar ();
>>
>> return r;
>> }
>>
>> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
>> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
>>
>> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
>> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
>> saved register and produces:
>>
>> 0000000000000000 <test>:
>> 0: 53 push %rbx <--- callee save
>> 1: 89 fb mov %edi,%ebx <--- move 1
>> 3: f7 db neg %ebx
>> 5: 74 09 je 10 <test+0x10>
>> 7: e8 00 00 00 00 call c <test+0xc>
>> c: 89 d8 mov %ebx,%eax <--- callee restore
>> e: 5b pop %rbx
>> f: c3 ret
>> 10: e8 00 00 00 00 call 15 <test+0x15>
>> 15: 89 d8 mov %ebx,%eax <--- move 2
>> 17: 5b pop %rbx <--- callee restore
>> 18: c3 ret
>>
>> Mainline used EAX since it has costs 13000. It is not 100% clear to me
>> why.
>> - 12000 is the spilling (which is emitted twice but executed just once)
>> - I would have expected 2000 for the move from edi to eax.
>> However even if cost is 14000 we will choose EAX. The code is:
>>
>> 0: 89 f8 mov %edi,%eax <--- move1
>> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
>> 6: f7 d8 neg %eax
>> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
>> c: 85 ff test %edi,%edi
>> e: 74 10 je 20 <test+0x20>
>> 10: e8 00 00 00 00 call 15 <test+0x15>
>> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
>> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
>> 1d: c3 ret
>> 1e: 66 90 xchg %ax,%ax
>> 20: e8 00 00 00 00 call 25 <test+0x25>
>> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
>> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
>> 2d: c3 ret
>>
>> This sequence really saves one move at the expense of of stack frame
>> allocation (which is not modelled by the cost model) and longer spill
>> code (also no modelled).
>> [...]
>> PR117082 is about noreturn function:
>> __attribute__ ((noreturn))
>> void
>> f3 (void)
>> {
>> int y0 = x0;
>> int y1 = x1;
>> f1 ();
>> f2 (y0, y1);
>> while (1);
>> }
>>
>> Here the cost model is really wrong by assuming that entry and exit
>> block have same frequencies. This can be fixed quite easilly (though it
>> is a rare case)
>
> Yeah (and nice example).
>
>> PR118497 seems to be ixed.
>>
>> So overall I think
>> 1) we can fix scaling of epilogue by exit block frequency
>> to get noreturns right.
>> 2) we should drop the check for optimize_size. Since with -Os
>> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
>> same way
>> 3) we currently have wire in "-1" to biass the cost metric for callee
>> saved registers.
>> It may make sense to allow targets to control this, since i.e. x86
>> has push/pop that is shorter. -3 would solve the testcase with neg
>> and would express that push/pop is still cheaper with extra reg-reg
>> move.
>> 4) cost model misses shring wrapping, the fact that if register is
>> callee saved it may be used by multiple allocnos and also that
>> push/pop sequence may avoid need for manual RSP adjustments.
>>
>> Those seems bit harder things to fit in though.
>>
>> So if we want to go with the target hook, I think it should adjust the
>> cost before scalling (since targets may have special tricks for
>> prologues) rather than the scale factor (which is target independent
>> part of cost model).
>
> Like you say, one of the missing pieces appears to be the allocation/
> dealloaction overhead for callee saves. Could we try to add a hook to
> model that cost, based on certain parameters?
>
> In particular, one thing that the examples above have in common is that
> they don't need to allocate a frame for local variables. That seems
> like it ought to be part of the mix. If we need to allocate a frame
> using addition anyway, then presumably one of the advantages of push/pop
> over callee saves goes away.
>
> But going back to my question above, modelling the allocation and
> deallocation would need to be done in a way that makes them more
> expensive than moves. I think in some cases we've assumed that
> addition and moves have equal cost, even when optimising for speed.
>
> In other words, rather than add a hook to tweak the -1 bias (which I
> still agree is a more reasonable thing to do than bypassing the IRA
> code altogether), could we add a separate hook for the allocation/
> deallocation and leave IRA to work out when to apply it?
>
> I suppose for -fno-omit-frame-pointer we should also take the creation
> of the frame link into account, if we don't already. This matters on
> aarch64 since -fomit-frame-pointer is not the default at -O2.
>
> One quirk on aarch64 is that, if we're using an odd number of
> callee-saved GPRs, adding one more is essentially free, since we would
> allocate space for it anyway, and could save and restore it alongside
> the odd one out. That would probably be difficult to apply in practice
> though. And it's obviously a separate issue from the current one.
> Just mentioning it as another thing we could model in future.
FWIW, here's a very rough initial version of the kind of thing
I was thinking about. Hopefully the hook documentation describes
the approach. It's deliberately (overly?) flexible.
I've included an aarch64 version that (a) models the fact that the
first caller-save can also allocate the frame more-or-less for free,
and (b) once we've saved an odd number of GPRs, saving one more is
essentialy free. I also hacked up an x86 version locally to model
the allocation benefits of using caller-saved registers. It seemed
to fix the povray example above.
This still needs a lot of clean-up and testing, but I thought I might
as well send what I have before leaving for the weekend. Does it look
reasonable in principle?
Thanks,
Richard
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c1e40200806..3f9453a74fb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15870,6 +15870,60 @@ aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
: base + aarch64_tune_params.memmov_cost.store_int);
}
+/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
+ RA has already decided to use. Return the total number of GPRs
+ that need to be saved and restored, including the frame link registers. */
+static int
+aarch64_count_gpr_saves (const HARD_REG_SET &callee_saved_regs)
+{
+ auto saved_gprs = callee_saved_regs & reg_class_contents[GENERAL_REGS];
+ /* FIXME: popcount hack. */
+ auto nregs = popcount_hwi (saved_gprs.elts[0]);
+
+ if (aarch64_needs_frame_chain ())
+ nregs += 2;
+ else if (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM))
+ nregs += 1;
+ return nregs;
+}
+
+/* Implement TARGET_CALLEE_SAVE_COST. */
+static int
+aarch64_callee_save_cost (spill_cost_type, unsigned int regno,
+ machine_mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spill_p)
+{
+ auto ngprs = aarch64_count_gpr_saves (callee_saved_regs);
+
+ /* FIXME: handle FP and Advanced SIMD registers. */
+
+ /* If we've already committed to saving an odd number of GPRs, assume that
+ saving one more will involve turning an STR into an STP. */
+ if (GP_REGNUM_P (regno) && nregs == 1 && (ngprs & 1))
+ return 0;
+
+ /* If this would be the first register that we save, add the cost of
+ allocating or deallocating the frame. For GPR saves, the allocation
+ and deallocation can be folded into the save and restore. */
+ if (!existing_spill_p
+ && ngprs == 0
+ && !GP_REGNUM_P (regno))
+ mem_cost += 2;
+
+ return mem_cost;
+}
+
+/* Implement TARGET_FRAME_ALLOCATION_COST. */
+static int
+aarch64_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &callee_saved_regs)
+{
+ if (aarch64_count_gpr_saves (callee_saved_regs) == 0)
+ return 2;
+ return 0;
+}
+
/* Implement TARGET_INSN_COST. We have the opportunity to do something
much more productive here, such as using insn attributes to cost things.
But we don't, not yet.
@@ -31572,6 +31626,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST aarch64_callee_save_cost
+
+#undef TARGET_FRAME_ALLOCATION_COST
+#define TARGET_FRAME_ALLOCATION_COST aarch64_frame_allocation_cost
+
#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 0de24eda6f0..171bb8c0b77 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7003,6 +7003,74 @@ value to the result of that function. The arguments to that function
are the same as to this target hook.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_CALLEE_SAVE_COST (spill_cost_type @var{cost_type}, unsigned int @var{hard_regno}, machine_mode @var{mode}, unsigned int @var{nregs}, int @var{mem_cost}, const HARD_REG_SET @var{&allocated_callee_regs}, bool @var{existing_spills_p})
+Return the one-off cost of saving or restoring callee-saved registers
+(also known as call-preserved register or non-volatile registers).
+The parameters are as follows:
+
+@itemize
+@item
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register
+and @samp{spill_cost_type::RESTORE} for restoring a register.
+
+@item
+@var{hard_regno} and @var{mode} represent the whole register that
+the register allocator is considering using; of these,
+@var{nregs} registers are fully or partially callee-saved.
+
+@item
+@var{mem_cost} is the normal cost for storing (for saves)
+or loading (for restores) the @var{nregs} registers.
+
+@item
+@var{allocated_callee_regs} is the set of callee-saved registers
+that are already in use.
+
+@item
+@var{existing_spills_p} is true if the register allocator has
+already decided to spill registers to memory.
+@end itemize
+
+If @var{existing_spills_p} is false, the cost of a save should account
+for frame allocations in a way that is consistent with
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for
+caller-saved registers. Similarly, the cost of a restore should then
+account for frame deallocations in a way that is consistent with
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.
+
+Note that this hook should not attempt to apply a frequency scale
+to the cost: it is the caller's responsibility to do that where
+appropriate.
+
+The default implementation returns @var{mem_cost}, plus the allocation
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},
+where appropriate.
+@end deftypefn
+
+@deftypefn {Target Hook} int TARGET_FRAME_ALLOCATION_COST (frame_cost_type @var{cost_type}, const HARD_REG_SET @var{&allocated_callee_regs})
+Return the cost of allocating or deallocating a frame for the sake of
+using caller-saved registers; @var{cost_type} chooses between allocation
+and deallocation.
+
+This hook is only called if the register allocator has not so far
+decided to spill any pseudo registers to memory or to use caller-saved
+registers for values that are live across a call. It may have
+decided to use callee-saved registers; if so, @var{allocated_callee_regs}
+is the set of callee-saved registers it has used. There might also be
+other reasons why a stack frame is already needed; for example,
+@samp{get_frame_size ()} may be nonzero, or the target might already
+require a frame for target-specific reasons.
+
+When the register allocator uses this hook to cost caller-saved registers,
+it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved
+registers, passing @samp{false} as the @var{existing_spills_p} argument.
+The intention is to allow the target to apply an apples-for-apples
+comparison between callee-saved and caller-saved registers when the
+allocator has not yet committed to using registers of both types.
+
+The default implementation returns 0.
+@end deftypefn
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 631d04131e3..eccc4d88493 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4582,6 +4582,10 @@ These macros are obsolete, new ports should use the target hook
@hook TARGET_MEMORY_MOVE_COST
+@hook TARGET_CALLEE_SAVE_COST
+
+@hook TARGET_FRAME_ALLOCATION_COST
+
@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
A C expression for the cost of a branch instruction. A value of 1 is
the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 0699b349a1a..9aaac794f35 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -1199,6 +1199,9 @@ finish_update_cost_records (void)
register was already allocated for an allocno. */
static bool allocated_hardreg_p[FIRST_PSEUDO_REGISTER];
+/* Which callee-saved hard registers we've decided to save. */
+static HARD_REG_SET allocated_callee_save_regs;
+
/* Describes one element in a queue of allocnos whose costs need to be
updated. Each allocno in the queue is known to have an allocno
class. */
@@ -1740,6 +1743,20 @@ check_hard_reg_p (ira_allocno_t a, int hard_regno,
return j == nregs;
}
+/* Record that we have allocated NREGS registers starting at HARD_REGNO. */
+
+static void
+record_allocation (int hard_regno, int nregs)
+{
+ for (int i = 0; i < nregs; ++i)
+ if (!allocated_hardreg_p[hard_regno + i])
+ {
+ allocated_hardreg_p[hard_regno + i] = true;
+ if (!crtl->abi->clobbers_full_reg_p (hard_regno + i))
+ SET_HARD_REG_BIT (allocated_callee_save_regs, hard_regno + i);
+ }
+}
+
/* Return number of registers needed to be saved and restored at
function prologue/epilogue if we allocate HARD_REGNO to hold value
of MODE. */
@@ -1961,6 +1978,14 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
#endif
auto_bitmap allocnos_to_spill;
HARD_REG_SET soft_conflict_regs = {};
+ int entry_freq = REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ int exit_freq = REG_FREQ_FROM_BB (EXIT_BLOCK_PTR_FOR_FN (cfun));
+ /* Whether we have spilled pseudos or used caller-saved registers
+ for values that are live across a call.
+
+ FIXME: also check for pseudos that prefer NO_REGS and not equivalences.
+ Also account for regular spills. */
+ bool existing_spills_p = caller_save_needed;
ira_assert (! ALLOCNO_ASSIGNED_P (a));
get_conflict_and_start_profitable_regs (a, retry_p,
@@ -2175,17 +2200,45 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
/* We need to save/restore the hard register in
epilogue/prologue. Therefore we increase the cost. */
{
+ int nregs = hard_regno_nregs (hard_regno, mode);
+ add_cost = 0;
rclass = REGNO_REG_CLASS (hard_regno);
- add_cost = ((ira_memory_move_cost[mode][rclass][0]
- + ira_memory_move_cost[mode][rclass][1])
- * saved_nregs / hard_regno_nregs (hard_regno,
- mode) - 1)
- * (optimize_size ? 1 :
- REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+
+ auto entry_cost = targetm.callee_save_cost
+ (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ add_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.callee_save_cost
+ (spill_cost_type::RESTORE, hard_regno, mode, saved_nregs,
+ ira_memory_move_cost[mode][rclass][1] * saved_nregs / nregs,
+ allocated_callee_save_regs, existing_spills_p);
+ add_cost += exit_cost * exit_freq;
+
+ /* In the event of a tie between caller-save and callee-save,
+ prefer callee-save. */
+ add_cost -= 1;
+
cost += add_cost;
full_cost += add_cost;
}
}
+ if (!existing_spills_p && ira_need_caller_save_p (a, hard_regno))
+ {
+ add_cost = 0;
+
+ auto entry_cost = targetm.frame_allocation_cost
+ (frame_cost_type::ALLOCATION, allocated_callee_save_regs);
+ add_cost += entry_cost * entry_freq;
+
+ auto exit_cost = targetm.frame_allocation_cost
+ (frame_cost_type::DEALLOCATION, allocated_callee_save_regs);
+ add_cost += exit_cost * exit_freq;
+
+ cost += add_cost;
+ full_cost += add_cost;
+ }
if (min_cost > cost)
min_cost = cost;
if (min_full_cost > full_cost)
@@ -2212,8 +2265,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
fail:
if (best_hard_regno >= 0)
{
- for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--)
- allocated_hardreg_p[best_hard_regno + i] = true;
+ record_allocation (best_hard_regno,
+ hard_regno_nregs (best_hard_regno, mode));
spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs,
best_hard_regno);
}
@@ -3369,8 +3422,7 @@ improve_allocation (void)
/* Assign the best chosen hard register to A. */
ALLOCNO_HARD_REGNO (a) = best;
- for (j = nregs - 1; j >= 0; j--)
- allocated_hardreg_p[best + j] = true;
+ record_allocation (best, nregs);
if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
diff --git a/gcc/target.def b/gcc/target.def
index 4050b2ebdd4..fdd80c06c77 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3775,6 +3775,80 @@ are the same as to this target hook.",
int, (machine_mode mode, reg_class_t rclass, bool in),
default_memory_move_cost)
+DEFHOOK
+(callee_save_cost,
+ "Return the one-off cost of saving or restoring callee-saved registers\n\
+(also known as call-preserved register or non-volatile registers).\n\
+The parameters are as follows:\n\
+\n\
+@itemize\n\
+@item\n\
+@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register\n\
+and @samp{spill_cost_type::RESTORE} for restoring a register.\n\
+\n\
+@item\n\
+@var{hard_regno} and @var{mode} represent the whole register that\n\
+the register allocator is considering using; of these,\n\
+@var{nregs} registers are fully or partially callee-saved.\n\
+\n\
+@item\n\
+@var{mem_cost} is the normal cost for storing (for saves)\n\
+or loading (for restores) the @var{nregs} registers.\n\
+\n\
+@item\n\
+@var{allocated_callee_regs} is the set of callee-saved registers\n\
+that are already in use.\n\
+\n\
+@item\n\
+@var{existing_spills_p} is true if the register allocator has\n\
+already decided to spill registers to memory.\n\
+@end itemize\n\
+\n\
+If @var{existing_spills_p} is false, the cost of a save should account\n\
+for frame allocations in a way that is consistent with\n\
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for\n\
+caller-saved registers. Similarly, the cost of a restore should then\n\
+account for frame deallocations in a way that is consistent with\n\
+@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.\n\
+\n\
+Note that this hook should not attempt to apply a frequency scale\n\
+to the cost: it is the caller's responsibility to do that where\n\
+appropriate.\n\
+\n\
+The default implementation returns @var{mem_cost}, plus the allocation\n\
+or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},\n\
+where appropriate.",
+ int, (spill_cost_type cost_type, unsigned int hard_regno,
+ machine_mode mode, unsigned int nregs, int mem_cost,
+ const HARD_REG_SET &allocated_callee_regs, bool existing_spills_p),
+ default_callee_save_cost)
+
+DEFHOOK
+(frame_allocation_cost,
+ "Return the cost of allocating or deallocating a frame for the sake of\n\
+using caller-saved registers; @var{cost_type} chooses between allocation\n\
+and deallocation.\n\
+\n\
+This hook is only called if the register allocator has not so far\n\
+decided to spill any pseudo registers to memory or to use caller-saved\n\
+registers for values that are live across a call. It may have\n\
+decided to use callee-saved registers; if so, @var{allocated_callee_regs}\n\
+is the set of callee-saved registers it has used. There might also be\n\
+other reasons why a stack frame is already needed; for example,\n\
+@samp{get_frame_size ()} may be nonzero, or the target might already\n\
+require a frame for target-specific reasons.\n\
+\n\
+When the register allocator uses this hook to cost caller-saved registers,\n\
+it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved\n\
+registers, passing @samp{false} as the @var{existing_spills_p} argument.\n\
+The intention is to allow the target to apply an apples-for-apples\n\
+comparison between callee-saved and caller-saved registers when the\n\
+allocator has not yet committed to using registers of both types.\n\
+\n\
+The default implementation returns 0.",
+ int, (frame_cost_type cost_type, const HARD_REG_SET &allocated_callee_regs),
+ default_frame_allocation_cost)
+
DEFHOOK
(use_by_pieces_infrastructure_p,
"GCC will attempt several strategies when asked to copy between\n\
diff --git a/gcc/target.h b/gcc/target.h
index 3e1ee68a341..2bf35e2d0ee 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -284,6 +284,18 @@ enum poly_value_estimate_kind
POLY_VALUE_LIKELY
};
+enum class spill_cost_type
+{
+ SAVE,
+ RESTORE
+};
+
+enum class frame_cost_type
+{
+ ALLOCATION,
+ DEALLOCATION
+};
+
typedef void (*emit_support_tinfos_callback) (tree);
extern bool verify_type_context (location_t, type_context_kind, const_tree,
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index f80dc8b2e7e..a8331ba5576 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2075,6 +2075,29 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
#endif
}
+int
+default_callee_save_cost (spill_cost_type spill_type, unsigned int,
+ machine_mode, unsigned int, int mem_cost,
+ const HARD_REG_SET &callee_saved_regs,
+ bool existing_spills_p)
+{
+ if (!existing_spills_p)
+ {
+ auto frame_type = (spill_type == spill_cost_type::SAVE
+ ? frame_cost_type::ALLOCATION
+ : frame_cost_type::DEALLOCATION);
+ mem_cost += targetm.frame_allocation_cost (frame_type,
+ callee_saved_regs);
+ }
+ return mem_cost;
+}
+
+int
+default_frame_allocation_cost (frame_cost_type, const HARD_REG_SET &)
+{
+ return 0;
+}
+
/* The default implementation of TARGET_SLOW_UNALIGNED_ACCESS. */
bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 7d15f632c23..0c4c27e257e 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -234,6 +234,11 @@ extern tree default_builtin_tm_load_store (tree);
extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
extern int default_register_move_cost (machine_mode, reg_class_t,
reg_class_t);
+extern int default_callee_save_cost (spill_cost_type, unsigned int,
+ machine_mode, unsigned int, int,
+ const HARD_REG_SET &, bool);
+extern int default_frame_allocation_cost (frame_cost_type,
+ const HARD_REG_SET &);
extern bool default_slow_unaligned_access (machine_mode, unsigned int);
extern HOST_WIDE_INT default_estimated_poly_value (poly_int64,
poly_value_estimate_kind);
On 2/6/25 5:35 PM, Jan Hubicka wrote:
>
> Register 3 (first caller saved) has cost 11000. This comes from:
> add_cost = ((ira_memory_move_cost[mode][rclass][0]
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> ^^
> here
> * (optimize_size ? 1 :
> REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
>
> There is no comment why -1, but I suppose it is there to biass costs to
> use prologue/epilogue instad of caller save sequence when runtime cost
> estimate is even.
It is a very old code. As I remember RA w/o this avoided to use a
callee-saved reg at all in cases when it could be used for more one
pseudo. The idea was also that some targets typical savings/restores
are cheaper (pop/push, multiple reg lds/sts) and it should be taken into
account somehow.
>
>
> 0000000000000000 <test>:
> 0: 53 push %rbx <--- callee save
> 1: 89 fb mov %edi,%ebx <--- move 1
> 3: f7 db neg %ebx
> 5: 74 09 je 10 <test+0x10>
> 7: e8 00 00 00 00 call c <test+0xc>
> c: 89 d8 mov %ebx,%eax <--- callee restore
> e: 5b pop %rbx
> f: c3 ret
> 10: e8 00 00 00 00 call 15 <test+0x15>
> 15: 89 d8 mov %ebx,%eax <--- move 2
> 17: 5b pop %rbx <--- callee restore
> 18: c3 ret
>
> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> why.
In many cases it is hard to find why this particular cost occurs as the
costs are updated dynamically and assignment a pseudo to hard reg can
affect cost for another pseudo which is not involved in a move with the
first pseudo (but involved through a chain of moves). Those are
complicated heuristics changed several times and verified by visible
SPEC performance improvements.
>
> So overall I think
> 1) we can fix scaling of epilogue by exit block frequency
> to get noreturns right.
> 2) we should drop the check for optimize_size. Since with -Os
> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> same way
> 3) we currently have wire in "-1" to biass the cost metric for callee
> saved registers.
> It may make sense to allow targets to control this, since i.e. x86
> has push/pop that is shorter. -3 would solve the testcase with neg
> and would express that push/pop is still cheaper with extra reg-reg
> move.
> 4) cost model misses shring wrapping, the fact that if register is
> callee saved it may be used by multiple allocnos and also that
> push/pop sequence may avoid need for manual RSP adjustments.
Shrink wrapping was later addition to RA and I guess nobody thought how
to update cost model taking it into account.
> Those seems bit harder things to fit in though.
>
> So if we want to go with the target hook, I think it should adjust the
> cost before scalling (since targets may have special tricks for
> prologues) rather than the scale factor (which is target independent
> part of cost model).
Very nice analysis, Honza. I believe we still need a hook and I'll work
on the target hook improvement.
On Fri, Feb 7, 2025 at 9:20 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > Really nice analysis! Thanks for writing this up.
> >
> > Sorry for the big quote below, but:
> >
> > Jan Hubicka <hubicka@ucw.cz> writes:
> >> [...]
> >> PR117081 is about regression in povray. The reducted testcase:
> >>
> >> void foo (void);
> >> void bar (void);
> >>
> >> int
> >> test (int a)
> >> {
> >> int r;
> >>
> >> if (r = -a)
> >> foo ();
> >> else
> >> bar ();
> >>
> >> return r;
> >> }
> >>
> >> shows that we now use caller saved register (EAX) to hold the return value which yields longer code. The costs are
> >> Popping a0(r98,l0) -- (0=13000,13000) (3=15000,15000) (6=15000,15000) (40=15000,15000) (41=15000,15000) (42=15000,15000) (43=15000,15000)
> >>
> >> here 15000 is 11000+4000 where I think 4000 is cost of 2 reg-reg moves
> >> multiplied by REG_FREQ_MAX. This seems correct. GCC 13 uses callee
> >> saved register and produces:
> >>
> >> 0000000000000000 <test>:
> >> 0: 53 push %rbx <--- callee save
> >> 1: 89 fb mov %edi,%ebx <--- move 1
> >> 3: f7 db neg %ebx
> >> 5: 74 09 je 10 <test+0x10>
> >> 7: e8 00 00 00 00 call c <test+0xc>
> >> c: 89 d8 mov %ebx,%eax <--- callee restore
> >> e: 5b pop %rbx
> >> f: c3 ret
> >> 10: e8 00 00 00 00 call 15 <test+0x15>
> >> 15: 89 d8 mov %ebx,%eax <--- move 2
> >> 17: 5b pop %rbx <--- callee restore
> >> 18: c3 ret
> >>
> >> Mainline used EAX since it has costs 13000. It is not 100% clear to me
> >> why.
> >> - 12000 is the spilling (which is emitted twice but executed just once)
> >> - I would have expected 2000 for the move from edi to eax.
> >> However even if cost is 14000 we will choose EAX. The code is:
> >>
> >> 0: 89 f8 mov %edi,%eax <--- move1
> >> 2: 48 83 ec 18 sub $0x18,%rsp <--- stack frame creation
> >> 6: f7 d8 neg %eax
> >> 8: 89 44 24 0c mov %eax,0xc(%rsp) <--- spill out
> >> c: 85 ff test %edi,%edi
> >> e: 74 10 je 20 <test+0x20>
> >> 10: e8 00 00 00 00 call 15 <test+0x15>
> >> 15: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> >> 19: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> >> 1d: c3 ret
> >> 1e: 66 90 xchg %ax,%ax
> >> 20: e8 00 00 00 00 call 25 <test+0x25>
> >> 25: 8b 44 24 0c mov 0xc(%rsp),%eax <--- spill in
> >> 29: 48 83 c4 18 add $0x18,%rsp <--- stack frame
> >> 2d: c3 ret
> >>
> >> This sequence really saves one move at the expense of of stack frame
> >> allocation (which is not modelled by the cost model) and longer spill
> >> code (also no modelled).
> >> [...]
> >> PR117082 is about noreturn function:
> >> __attribute__ ((noreturn))
> >> void
> >> f3 (void)
> >> {
> >> int y0 = x0;
> >> int y1 = x1;
> >> f1 ();
> >> f2 (y0, y1);
> >> while (1);
> >> }
> >>
> >> Here the cost model is really wrong by assuming that entry and exit
> >> block have same frequencies. This can be fixed quite easilly (though it
> >> is a rare case)
> >
> > Yeah (and nice example).
> >
> >> PR118497 seems to be ixed.
> >>
> >> So overall I think
> >> 1) we can fix scaling of epilogue by exit block frequency
> >> to get noreturns right.
> >> 2) we should drop the check for optimize_size. Since with -Os
> >> REG_FREQ_FROM_BB always returns 1000 everything should be scaled
> >> same way
> >> 3) we currently have wire in "-1" to biass the cost metric for callee
> >> saved registers.
> >> It may make sense to allow targets to control this, since i.e. x86
> >> has push/pop that is shorter. -3 would solve the testcase with neg
> >> and would express that push/pop is still cheaper with extra reg-reg
> >> move.
> >> 4) cost model misses shring wrapping, the fact that if register is
> >> callee saved it may be used by multiple allocnos and also that
> >> push/pop sequence may avoid need for manual RSP adjustments.
> >>
> >> Those seems bit harder things to fit in though.
> >>
> >> So if we want to go with the target hook, I think it should adjust the
> >> cost before scalling (since targets may have special tricks for
> >> prologues) rather than the scale factor (which is target independent
> >> part of cost model).
> >
> > Like you say, one of the missing pieces appears to be the allocation/
> > dealloaction overhead for callee saves. Could we try to add a hook to
> > model that cost, based on certain parameters?
> >
> > In particular, one thing that the examples above have in common is that
> > they don't need to allocate a frame for local variables. That seems
> > like it ought to be part of the mix. If we need to allocate a frame
> > using addition anyway, then presumably one of the advantages of push/pop
> > over callee saves goes away.
> >
> > But going back to my question above, modelling the allocation and
> > deallocation would need to be done in a way that makes them more
> > expensive than moves. I think in some cases we've assumed that
> > addition and moves have equal cost, even when optimising for speed.
> >
> > In other words, rather than add a hook to tweak the -1 bias (which I
> > still agree is a more reasonable thing to do than bypassing the IRA
> > code altogether), could we add a separate hook for the allocation/
> > deallocation and leave IRA to work out when to apply it?
> >
> > I suppose for -fno-omit-frame-pointer we should also take the creation
> > of the frame link into account, if we don't already. This matters on
> > aarch64 since -fomit-frame-pointer is not the default at -O2.
> >
> > One quirk on aarch64 is that, if we're using an odd number of
> > callee-saved GPRs, adding one more is essentially free, since we would
> > allocate space for it anyway, and could save and restore it alongside
> > the odd one out. That would probably be difficult to apply in practice
> > though. And it's obviously a separate issue from the current one.
> > Just mentioning it as another thing we could model in future.
>
> FWIW, here's a very rough initial version of the kind of thing
> I was thinking about. Hopefully the hook documentation describes
> the approach. It's deliberately (overly?) flexible.
>
> I've included an aarch64 version that (a) models the fact that the
> first caller-save can also allocate the frame more-or-less for free,
> and (b) once we've saved an odd number of GPRs, saving one more is
> essentialy free. I also hacked up an x86 version locally to model
> the allocation benefits of using caller-saved registers. It seemed
> to fix the povray example above.
Note the pair allocation for aarch64 was filed PR 117477 so it would
be useful to check if it fixes that and include the bug # in the
commit if it does.
Thanks,
Andrew
>
> This still needs a lot of clean-up and testing, but I thought I might
> as well send what I have before leaving for the weekend. Does it look
> reasonable in principle?
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c1e40200806..3f9453a74fb 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -15870,6 +15870,60 @@ aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
> : base + aarch64_tune_params.memmov_cost.store_int);
> }
>
> +/* CALLEE_SAVED_REGS is the set of callee-saved registers that the
> + RA has already decided to use. Return the total number of GPRs
> + that need to be saved and restored, including the frame link registers. */
> +static int
> +aarch64_count_gpr_saves (const HARD_REG_SET &callee_saved_regs)
> +{
> + auto saved_gprs = callee_saved_regs & reg_class_contents[GENERAL_REGS];
> + /* FIXME: popcount hack. */
> + auto nregs = popcount_hwi (saved_gprs.elts[0]);
> +
> + if (aarch64_needs_frame_chain ())
> + nregs += 2;
> + else if (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM))
> + nregs += 1;
> + return nregs;
> +}
> +
> +/* Implement TARGET_CALLEE_SAVE_COST. */
> +static int
> +aarch64_callee_save_cost (spill_cost_type, unsigned int regno,
> + machine_mode, unsigned int nregs, int mem_cost,
> + const HARD_REG_SET &callee_saved_regs,
> + bool existing_spill_p)
> +{
> + auto ngprs = aarch64_count_gpr_saves (callee_saved_regs);
> +
> + /* FIXME: handle FP and Advanced SIMD registers. */
> +
> + /* If we've already committed to saving an odd number of GPRs, assume that
> + saving one more will involve turning an STR into an STP. */
> + if (GP_REGNUM_P (regno) && nregs == 1 && (ngprs & 1))
> + return 0;
> +
> + /* If this would be the first register that we save, add the cost of
> + allocating or deallocating the frame. For GPR saves, the allocation
> + and deallocation can be folded into the save and restore. */
> + if (!existing_spill_p
> + && ngprs == 0
> + && !GP_REGNUM_P (regno))
> + mem_cost += 2;
> +
> + return mem_cost;
> +}
> +
> +/* Implement TARGET_FRAME_ALLOCATION_COST. */
> +static int
> +aarch64_frame_allocation_cost (frame_cost_type,
> + const HARD_REG_SET &callee_saved_regs)
> +{
> + if (aarch64_count_gpr_saves (callee_saved_regs) == 0)
> + return 2;
> + return 0;
> +}
> +
> /* Implement TARGET_INSN_COST. We have the opportunity to do something
> much more productive here, such as using insn attributes to cost things.
> But we don't, not yet.
> @@ -31572,6 +31626,12 @@ aarch64_libgcc_floating_mode_supported_p
> #undef TARGET_MEMORY_MOVE_COST
> #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>
> +#undef TARGET_CALLEE_SAVE_COST
> +#define TARGET_CALLEE_SAVE_COST aarch64_callee_save_cost
> +
> +#undef TARGET_FRAME_ALLOCATION_COST
> +#define TARGET_FRAME_ALLOCATION_COST aarch64_frame_allocation_cost
> +
> #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
> #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..171bb8c0b77 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -7003,6 +7003,74 @@ value to the result of that function. The arguments to that function
> are the same as to this target hook.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_CALLEE_SAVE_COST (spill_cost_type @var{cost_type}, unsigned int @var{hard_regno}, machine_mode @var{mode}, unsigned int @var{nregs}, int @var{mem_cost}, const HARD_REG_SET @var{&allocated_callee_regs}, bool @var{existing_spills_p})
> +Return the one-off cost of saving or restoring callee-saved registers
> +(also known as call-preserved register or non-volatile registers).
> +The parameters are as follows:
> +
> +@itemize
> +@item
> +@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register
> +and @samp{spill_cost_type::RESTORE} for restoring a register.
> +
> +@item
> +@var{hard_regno} and @var{mode} represent the whole register that
> +the register allocator is considering using; of these,
> +@var{nregs} registers are fully or partially callee-saved.
> +
> +@item
> +@var{mem_cost} is the normal cost for storing (for saves)
> +or loading (for restores) the @var{nregs} registers.
> +
> +@item
> +@var{allocated_callee_regs} is the set of callee-saved registers
> +that are already in use.
> +
> +@item
> +@var{existing_spills_p} is true if the register allocator has
> +already decided to spill registers to memory.
> +@end itemize
> +
> +If @var{existing_spills_p} is false, the cost of a save should account
> +for frame allocations in a way that is consistent with
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for
> +caller-saved registers. Similarly, the cost of a restore should then
> +account for frame deallocations in a way that is consistent with
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.
> +
> +Note that this hook should not attempt to apply a frequency scale
> +to the cost: it is the caller's responsibility to do that where
> +appropriate.
> +
> +The default implementation returns @var{mem_cost}, plus the allocation
> +or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},
> +where appropriate.
> +@end deftypefn
> +
> +@deftypefn {Target Hook} int TARGET_FRAME_ALLOCATION_COST (frame_cost_type @var{cost_type}, const HARD_REG_SET @var{&allocated_callee_regs})
> +Return the cost of allocating or deallocating a frame for the sake of
> +using caller-saved registers; @var{cost_type} chooses between allocation
> +and deallocation.
> +
> +This hook is only called if the register allocator has not so far
> +decided to spill any pseudo registers to memory or to use caller-saved
> +registers for values that are live across a call. It may have
> +decided to use callee-saved registers; if so, @var{allocated_callee_regs}
> +is the set of callee-saved registers it has used. There might also be
> +other reasons why a stack frame is already needed; for example,
> +@samp{get_frame_size ()} may be nonzero, or the target might already
> +require a frame for target-specific reasons.
> +
> +When the register allocator uses this hook to cost caller-saved registers,
> +it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved
> +registers, passing @samp{false} as the @var{existing_spills_p} argument.
> +The intention is to allow the target to apply an apples-for-apples
> +comparison between callee-saved and caller-saved registers when the
> +allocator has not yet committed to using registers of both types.
> +
> +The default implementation returns 0.
> +@end deftypefn
> +
> @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
> A C expression for the cost of a branch instruction. A value of 1 is
> the default; other values are interpreted relative to that. Parameter
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..eccc4d88493 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4582,6 +4582,10 @@ These macros are obsolete, new ports should use the target hook
>
> @hook TARGET_MEMORY_MOVE_COST
>
> +@hook TARGET_CALLEE_SAVE_COST
> +
> +@hook TARGET_FRAME_ALLOCATION_COST
> +
> @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
> A C expression for the cost of a branch instruction. A value of 1 is
> the default; other values are interpreted relative to that. Parameter
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..9aaac794f35 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -1199,6 +1199,9 @@ finish_update_cost_records (void)
> register was already allocated for an allocno. */
> static bool allocated_hardreg_p[FIRST_PSEUDO_REGISTER];
>
> +/* Which callee-saved hard registers we've decided to save. */
> +static HARD_REG_SET allocated_callee_save_regs;
> +
> /* Describes one element in a queue of allocnos whose costs need to be
> updated. Each allocno in the queue is known to have an allocno
> class. */
> @@ -1740,6 +1743,20 @@ check_hard_reg_p (ira_allocno_t a, int hard_regno,
> return j == nregs;
> }
>
> +/* Record that we have allocated NREGS registers starting at HARD_REGNO. */
> +
> +static void
> +record_allocation (int hard_regno, int nregs)
> +{
> + for (int i = 0; i < nregs; ++i)
> + if (!allocated_hardreg_p[hard_regno + i])
> + {
> + allocated_hardreg_p[hard_regno + i] = true;
> + if (!crtl->abi->clobbers_full_reg_p (hard_regno + i))
> + SET_HARD_REG_BIT (allocated_callee_save_regs, hard_regno + i);
> + }
> +}
> +
> /* Return number of registers needed to be saved and restored at
> function prologue/epilogue if we allocate HARD_REGNO to hold value
> of MODE. */
> @@ -1961,6 +1978,14 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> #endif
> auto_bitmap allocnos_to_spill;
> HARD_REG_SET soft_conflict_regs = {};
> + int entry_freq = REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
> + int exit_freq = REG_FREQ_FROM_BB (EXIT_BLOCK_PTR_FOR_FN (cfun));
> + /* Whether we have spilled pseudos or used caller-saved registers
> + for values that are live across a call.
> +
> + FIXME: also check for pseudos that prefer NO_REGS and not equivalences.
> + Also account for regular spills. */
> + bool existing_spills_p = caller_save_needed;
>
> ira_assert (! ALLOCNO_ASSIGNED_P (a));
> get_conflict_and_start_profitable_regs (a, retry_p,
> @@ -2175,17 +2200,45 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> /* We need to save/restore the hard register in
> epilogue/prologue. Therefore we increase the cost. */
> {
> + int nregs = hard_regno_nregs (hard_regno, mode);
> + add_cost = 0;
> rclass = REGNO_REG_CLASS (hard_regno);
> - add_cost = ((ira_memory_move_cost[mode][rclass][0]
> - + ira_memory_move_cost[mode][rclass][1])
> - * saved_nregs / hard_regno_nregs (hard_regno,
> - mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +
> + auto entry_cost = targetm.callee_save_cost
> + (spill_cost_type::SAVE, hard_regno, mode, saved_nregs,
> + ira_memory_move_cost[mode][rclass][0] * saved_nregs / nregs,
> + allocated_callee_save_regs, existing_spills_p);
> + add_cost += entry_cost * entry_freq;
> +
> + auto exit_cost = targetm.callee_save_cost
> + (spill_cost_type::RESTORE, hard_regno, mode, saved_nregs,
> + ira_memory_move_cost[mode][rclass][1] * saved_nregs / nregs,
> + allocated_callee_save_regs, existing_spills_p);
> + add_cost += exit_cost * exit_freq;
> +
> + /* In the event of a tie between caller-save and callee-save,
> + prefer callee-save. */
> + add_cost -= 1;
> +
> cost += add_cost;
> full_cost += add_cost;
> }
> }
> + if (!existing_spills_p && ira_need_caller_save_p (a, hard_regno))
> + {
> + add_cost = 0;
> +
> + auto entry_cost = targetm.frame_allocation_cost
> + (frame_cost_type::ALLOCATION, allocated_callee_save_regs);
> + add_cost += entry_cost * entry_freq;
> +
> + auto exit_cost = targetm.frame_allocation_cost
> + (frame_cost_type::DEALLOCATION, allocated_callee_save_regs);
> + add_cost += exit_cost * exit_freq;
> +
> + cost += add_cost;
> + full_cost += add_cost;
> + }
> if (min_cost > cost)
> min_cost = cost;
> if (min_full_cost > full_cost)
> @@ -2212,8 +2265,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> fail:
> if (best_hard_regno >= 0)
> {
> - for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--)
> - allocated_hardreg_p[best_hard_regno + i] = true;
> + record_allocation (best_hard_regno,
> + hard_regno_nregs (best_hard_regno, mode));
> spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs,
> best_hard_regno);
> }
> @@ -3369,8 +3422,7 @@ improve_allocation (void)
> /* Assign the best chosen hard register to A. */
> ALLOCNO_HARD_REGNO (a) = best;
>
> - for (j = nregs - 1; j >= 0; j--)
> - allocated_hardreg_p[best + j] = true;
> + record_allocation (best, nregs);
>
> if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
> fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..fdd80c06c77 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -3775,6 +3775,80 @@ are the same as to this target hook.",
> int, (machine_mode mode, reg_class_t rclass, bool in),
> default_memory_move_cost)
>
> +DEFHOOK
> +(callee_save_cost,
> + "Return the one-off cost of saving or restoring callee-saved registers\n\
> +(also known as call-preserved register or non-volatile registers).\n\
> +The parameters are as follows:\n\
> +\n\
> +@itemize\n\
> +@item\n\
> +@var{cost_type} is @samp{spill_cost_type::SAVE} for saving a register\n\
> +and @samp{spill_cost_type::RESTORE} for restoring a register.\n\
> +\n\
> +@item\n\
> +@var{hard_regno} and @var{mode} represent the whole register that\n\
> +the register allocator is considering using; of these,\n\
> +@var{nregs} registers are fully or partially callee-saved.\n\
> +\n\
> +@item\n\
> +@var{mem_cost} is the normal cost for storing (for saves)\n\
> +or loading (for restores) the @var{nregs} registers.\n\
> +\n\
> +@item\n\
> +@var{allocated_callee_regs} is the set of callee-saved registers\n\
> +that are already in use.\n\
> +\n\
> +@item\n\
> +@var{existing_spills_p} is true if the register allocator has\n\
> +already decided to spill registers to memory.\n\
> +@end itemize\n\
> +\n\
> +If @var{existing_spills_p} is false, the cost of a save should account\n\
> +for frame allocations in a way that is consistent with\n\
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of allocations for\n\
> +caller-saved registers. Similarly, the cost of a restore should then\n\
> +account for frame deallocations in a way that is consistent with\n\
> +@code{TARGET_FRAME_ALLOCATION_COST}'s handling of deallocations.\n\
> +\n\
> +Note that this hook should not attempt to apply a frequency scale\n\
> +to the cost: it is the caller's responsibility to do that where\n\
> +appropriate.\n\
> +\n\
> +The default implementation returns @var{mem_cost}, plus the allocation\n\
> +or deallocation cost returned by @code{TARGET_FRAME_ALLOCATION_COST},\n\
> +where appropriate.",
> + int, (spill_cost_type cost_type, unsigned int hard_regno,
> + machine_mode mode, unsigned int nregs, int mem_cost,
> + const HARD_REG_SET &allocated_callee_regs, bool existing_spills_p),
> + default_callee_save_cost)
> +
> +DEFHOOK
> +(frame_allocation_cost,
> + "Return the cost of allocating or deallocating a frame for the sake of\n\
> +using caller-saved registers; @var{cost_type} chooses between allocation\n\
> +and deallocation.\n\
> +\n\
> +This hook is only called if the register allocator has not so far\n\
> +decided to spill any pseudo registers to memory or to use caller-saved\n\
> +registers for values that are live across a call. It may have\n\
> +decided to use callee-saved registers; if so, @var{allocated_callee_regs}\n\
> +is the set of callee-saved registers it has used. There might also be\n\
> +other reasons why a stack frame is already needed; for example,\n\
> +@samp{get_frame_size ()} may be nonzero, or the target might already\n\
> +require a frame for target-specific reasons.\n\
> +\n\
> +When the register allocator uses this hook to cost caller-saved registers,\n\
> +it also uses @code{TARGET_CALLEE_SAVE_COST} to cost new callee-saved\n\
> +registers, passing @samp{false} as the @var{existing_spills_p} argument.\n\
> +The intention is to allow the target to apply an apples-for-apples\n\
> +comparison between callee-saved and caller-saved registers when the\n\
> +allocator has not yet committed to using registers of both types.\n\
> +\n\
> +The default implementation returns 0.",
> + int, (frame_cost_type cost_type, const HARD_REG_SET &allocated_callee_regs),
> + default_frame_allocation_cost)
> +
> DEFHOOK
> (use_by_pieces_infrastructure_p,
> "GCC will attempt several strategies when asked to copy between\n\
> diff --git a/gcc/target.h b/gcc/target.h
> index 3e1ee68a341..2bf35e2d0ee 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -284,6 +284,18 @@ enum poly_value_estimate_kind
> POLY_VALUE_LIKELY
> };
>
> +enum class spill_cost_type
> +{
> + SAVE,
> + RESTORE
> +};
> +
> +enum class frame_cost_type
> +{
> + ALLOCATION,
> + DEALLOCATION
> +};
> +
> typedef void (*emit_support_tinfos_callback) (tree);
>
> extern bool verify_type_context (location_t, type_context_kind, const_tree,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..a8331ba5576 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -2075,6 +2075,29 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
> #endif
> }
>
> +int
> +default_callee_save_cost (spill_cost_type spill_type, unsigned int,
> + machine_mode, unsigned int, int mem_cost,
> + const HARD_REG_SET &callee_saved_regs,
> + bool existing_spills_p)
> +{
> + if (!existing_spills_p)
> + {
> + auto frame_type = (spill_type == spill_cost_type::SAVE
> + ? frame_cost_type::ALLOCATION
> + : frame_cost_type::DEALLOCATION);
> + mem_cost += targetm.frame_allocation_cost (frame_type,
> + callee_saved_regs);
> + }
> + return mem_cost;
> +}
> +
> +int
> +default_frame_allocation_cost (frame_cost_type, const HARD_REG_SET &)
> +{
> + return 0;
> +}
> +
> /* The default implementation of TARGET_SLOW_UNALIGNED_ACCESS. */
>
> bool
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..0c4c27e257e 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -234,6 +234,11 @@ extern tree default_builtin_tm_load_store (tree);
> extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
> extern int default_register_move_cost (machine_mode, reg_class_t,
> reg_class_t);
> +extern int default_callee_save_cost (spill_cost_type, unsigned int,
> + machine_mode, unsigned int, int,
> + const HARD_REG_SET &, bool);
> +extern int default_frame_allocation_cost (frame_cost_type,
> + const HARD_REG_SET &);
> extern bool default_slow_unaligned_access (machine_mode, unsigned int);
> extern HOST_WIDE_INT default_estimated_poly_value (poly_int64,
> poly_value_estimate_kind);
> --
> 2.25.1
>
Richard Sandiford <richard.sandiford@arm.com> writes:
> FWIW, here's a very rough initial version of the kind of thing
> I was thinking about. Hopefully the hook documentation describes
> the approach. It's deliberately (overly?) flexible.
Argh! I forgot:
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index de34be31f48..4b1e7eff41f 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -5253,6 +5253,7 @@ color (void)
{
allocno_stack_vec.create (ira_allocnos_num);
memset (allocated_hardreg_p, 0, sizeof (allocated_hardreg_p));
+ CLEAR_HARD_REG_SET (allocated_callee_save_regs);
ira_initiate_assign ();
do_coloring ();
ira_finish_assign ();
> This still needs a lot of clean-up and testing, but I thought I might
> as well send what I have before leaving for the weekend. Does it look
> reasonable in principle?
To be clear, "testing" of course includes performance testing
(SPEC, but alo other things).
Thanks,
Richard
On Sun, Feb 2, 2025 at 10:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> Date: Tue Jun 25 08:37:49 2024 -0500
>
> ira: Scale save/restore costs of callee save registers with block frequency
>
> scales the cost of saving/restoring a callee-save hard register in epilogue
> and prologue with the entry block frequency, which, if not optimizing for
> size, is 10000, for all targets. As the result, callee-saved registers
> may not be used to preserve local variable values across calls on some
> targets, like x86. Add a target hook for the callee-saved register cost
> scale in epilogue and prologue used by IRA. The default version of this
> target hook returns 1 if optimizing for size, otherwise returns the entry
> block frequency. Add an x86 version of this target hook to restore the
> old behavior prior to the above commit.
Hi,
As mentioned by others, this is not the right thing to do for x86_64.
Plus it sometimes causes shrink wrapping NOT to happen. See PR 110008
which I had to reopen as being broken by this.
Thanks,
Andrew
>
> PR rtl-optimization/111673
> PR rtl-optimization/115932
> PR rtl-optimization/116028
> PR rtl-optimization/117081
> PR rtl-optimization/117082
> PR rtl-optimization/118497
> * ira-color.cc (assign_hard_reg): Call the target hook for the
> callee-saved register cost scale in epilogue and prologue.
> * target.def (ira_callee_saved_register_cost_scale): New target
> hook.
> * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> New.
> * targhooks.h (default_ira_callee_saved_register_cost_scale):
> Likewise.
> * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> New.
> (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> * doc/tm.texi: Regenerated.
> * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> New.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386.cc | 11 +++++++++++
> gcc/doc/tm.texi | 8 ++++++++
> gcc/doc/tm.texi.in | 2 ++
> gcc/ira-color.cc | 3 +--
> gcc/target.def | 12 ++++++++++++
> gcc/targhooks.cc | 8 ++++++++
> gcc/targhooks.h | 1 +
> 7 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index f89201684a8..3128973ba79 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> return false;
> }
>
> +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> +
> +static int
> +ix86_ira_callee_saved_register_cost_scale (int)
> +{
> + return 1;
> +}
> +
> /* Return true if a set of DST by the expression SRC should be allowed.
> This prevents complex sets of likely_spilled hard regs before split1. */
>
> @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> #undef TARGET_CLASS_LIKELY_SPILLED_P
> #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> + ix86_ira_callee_saved_register_cost_scale
>
> #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 0de24eda6f0..9f42913a4ef 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> The default version of this target hook always returns given class.
> @end deftypefn
>
> +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> +A target hook which returns the callee-saved register @var{hard_regno}
> +cost scale in epilogue and prologue used by IRA.
> +
> +The default version of this target hook returns 1 if optimizing for
> +size, otherwise returns the entry block frequency.
> +@end deftypefn
> +
> @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> A target hook which returns true if we use LRA instead of reload pass.
>
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 631d04131e3..6dbe22581ca 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -2388,6 +2388,8 @@ in the reload pass.
>
> @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
>
> +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> +
> @hook TARGET_LRA_P
>
> @hook TARGET_REGISTER_PRIORITY
> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 0699b349a1a..233060e1587 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> + ira_memory_move_cost[mode][rclass][1])
> * saved_nregs / hard_regno_nregs (hard_regno,
> mode) - 1)
> - * (optimize_size ? 1 :
> - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> cost += add_cost;
> full_cost += add_cost;
> }
> diff --git a/gcc/target.def b/gcc/target.def
> index 4050b2ebdd4..c348b15815a 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -5714,6 +5714,18 @@ DEFHOOK
> reg_class_t, (int, reg_class_t, reg_class_t),
> default_ira_change_pseudo_allocno_class)
>
> +/* Scale of callee-saved register cost in epilogue and prologue used by
> + IRA. */
> +DEFHOOK
> +(ira_callee_saved_register_cost_scale,
> + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> +cost scale in epilogue and prologue used by IRA.\n\
> +\n\
> +The default version of this target hook returns 1 if optimizing for\n\
> +size, otherwise returns the entry block frequency.",
> + int, (int hard_regno),
> + default_ira_callee_saved_register_cost_scale)
> +
> /* Return true if we use LRA instead of reload. */
> DEFHOOK
> (lra_p,
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index f80dc8b2e7e..344075efa41 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> return cl;
> }
>
> +int
> +default_ira_callee_saved_register_cost_scale (int)
> +{
> + return (optimize_size
> + ? 1
> + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> +}
> +
> extern bool
> default_lra_p (void)
> {
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 7d15f632c23..8871e01430c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> reg_class_t);
> +extern int default_ira_callee_saved_register_cost_scale (int);
> extern bool default_lra_p (void);
> extern int default_register_priority (int);
> extern bool default_register_usage_leveling_p (void);
> --
> 2.48.1
>
On Mon, Feb 10, 2025 at 9:56 AM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Sun, Feb 2, 2025 at 10:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > commit 3b9b8d6cfdf59337f4b7ce10ce92a98044b2657b
> > Author: Surya Kumari Jangala <jskumari@linux.ibm.com>
> > Date: Tue Jun 25 08:37:49 2024 -0500
> >
> > ira: Scale save/restore costs of callee save registers with block frequency
> >
> > scales the cost of saving/restoring a callee-save hard register in epilogue
> > and prologue with the entry block frequency, which, if not optimizing for
> > size, is 10000, for all targets. As the result, callee-saved registers
> > may not be used to preserve local variable values across calls on some
> > targets, like x86. Add a target hook for the callee-saved register cost
> > scale in epilogue and prologue used by IRA. The default version of this
> > target hook returns 1 if optimizing for size, otherwise returns the entry
> > block frequency. Add an x86 version of this target hook to restore the
> > old behavior prior to the above commit.
>
>
> Hi,
> As mentioned by others, this is not the right thing to do for x86_64.
> Plus it sometimes causes shrink wrapping NOT to happen. See PR 110008
> which I had to reopen as being broken by this.
The testcase in
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/673875.html
also fails now too for x86_64 and that is from real code unlike the
other testcases.
I would say revert the hook and fix the issue some other way.
Thanks,
Andrew
>
> Thanks,
> Andrew
>
>
> >
> > PR rtl-optimization/111673
> > PR rtl-optimization/115932
> > PR rtl-optimization/116028
> > PR rtl-optimization/117081
> > PR rtl-optimization/117082
> > PR rtl-optimization/118497
> > * ira-color.cc (assign_hard_reg): Call the target hook for the
> > callee-saved register cost scale in epilogue and prologue.
> > * target.def (ira_callee_saved_register_cost_scale): New target
> > hook.
> > * targhooks.cc (default_ira_callee_saved_register_cost_scale):
> > New.
> > * targhooks.h (default_ira_callee_saved_register_cost_scale):
> > Likewise.
> > * config/i386/i386.cc (ix86_ira_callee_saved_register_cost_scale):
> > New.
> > (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE): Likewise.
> > * doc/tm.texi: Regenerated.
> > * doc/tm.texi.in (TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE):
> > New.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > gcc/config/i386/i386.cc | 11 +++++++++++
> > gcc/doc/tm.texi | 8 ++++++++
> > gcc/doc/tm.texi.in | 2 ++
> > gcc/ira-color.cc | 3 +--
> > gcc/target.def | 12 ++++++++++++
> > gcc/targhooks.cc | 8 ++++++++
> > gcc/targhooks.h | 1 +
> > 7 files changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index f89201684a8..3128973ba79 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
> > return false;
> > }
> >
> > +/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
> > +
> > +static int
> > +ix86_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return 1;
> > +}
> > +
> > /* Return true if a set of DST by the expression SRC should be allowed.
> > This prevents complex sets of likely_spilled hard regs before split1. */
> >
> > @@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
> > #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
> > #undef TARGET_CLASS_LIKELY_SPILLED_P
> > #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> > +#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> > + ix86_ira_callee_saved_register_cost_scale
> >
> > #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> > #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 0de24eda6f0..9f42913a4ef 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
> > The default version of this target hook always returns given class.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
> > +A target hook which returns the callee-saved register @var{hard_regno}
> > +cost scale in epilogue and prologue used by IRA.
> > +
> > +The default version of this target hook returns 1 if optimizing for
> > +size, otherwise returns the entry block frequency.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} bool TARGET_LRA_P (void)
> > A target hook which returns true if we use LRA instead of reload pass.
> >
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index 631d04131e3..6dbe22581ca 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -2388,6 +2388,8 @@ in the reload pass.
> >
> > @hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> >
> > +@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> > +
> > @hook TARGET_LRA_P
> >
> > @hook TARGET_REGISTER_PRIORITY
> > diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> > index 0699b349a1a..233060e1587 100644
> > --- a/gcc/ira-color.cc
> > +++ b/gcc/ira-color.cc
> > @@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
> > + ira_memory_move_cost[mode][rclass][1])
> > * saved_nregs / hard_regno_nregs (hard_regno,
> > mode) - 1)
> > - * (optimize_size ? 1 :
> > - REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > + * targetm.ira_callee_saved_register_cost_scale (hard_regno);
> > cost += add_cost;
> > full_cost += add_cost;
> > }
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 4050b2ebdd4..c348b15815a 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -5714,6 +5714,18 @@ DEFHOOK
> > reg_class_t, (int, reg_class_t, reg_class_t),
> > default_ira_change_pseudo_allocno_class)
> >
> > +/* Scale of callee-saved register cost in epilogue and prologue used by
> > + IRA. */
> > +DEFHOOK
> > +(ira_callee_saved_register_cost_scale,
> > + "A target hook which returns the callee-saved register @var{hard_regno}\n\
> > +cost scale in epilogue and prologue used by IRA.\n\
> > +\n\
> > +The default version of this target hook returns 1 if optimizing for\n\
> > +size, otherwise returns the entry block frequency.",
> > + int, (int hard_regno),
> > + default_ira_callee_saved_register_cost_scale)
> > +
> > /* Return true if we use LRA instead of reload. */
> > DEFHOOK
> > (lra_p,
> > diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> > index f80dc8b2e7e..344075efa41 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
> > return cl;
> > }
> >
> > +int
> > +default_ira_callee_saved_register_cost_scale (int)
> > +{
> > + return (optimize_size
> > + ? 1
> > + : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
> > +}
> > +
> > extern bool
> > default_lra_p (void)
> > {
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> > index 7d15f632c23..8871e01430c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
> > extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
> > extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
> > reg_class_t);
> > +extern int default_ira_callee_saved_register_cost_scale (int);
> > extern bool default_lra_p (void);
> > extern int default_register_priority (int);
> > extern bool default_register_usage_leveling_p (void);
> > --
> > 2.48.1
> >
On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> > PR117081 is about regression in povray. The reducted testcase:
> Just for clarification. PR117081 is not about regression in povray.
> it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> testl
> The pr91384.c is added by r12-7417 which is peephole optimization
> expecting some specific instruction sequence, the regression can be
> fixed by adding new peephole pattern.
>
> H.J patch actually regressed povray by introducing extra push/pops
> (since it adds preference for callee save registers, in the benchmark
> using caller saved registers is much better).
> Sorry, I may not have been clear in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
>
My patch doesn't change the codegen for that code as shown by
commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Feb 7 13:49:30 2025 +0800
x86: Verify that PUSH/POP can be skipped
> PR117081 is about regression in povray. The reducted testcase:
Just for clarification. PR117081 is not about regression in povray.
it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
testl
The pr91384.c is added by r12-7417 which is peephole optimization
expecting some specific instruction sequence, the regression can be
fixed by adding new peephole pattern.
H.J patch actually regressed povray by introducing extra push/pops
(since it adds preference for callee save registers, in the benchmark
using caller saved registers is much better).
Sorry, I may not have been clear in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
On Tue, Feb 11, 2025 at 4:38 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Feb 11, 2025 at 4:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > > PR117081 is about regression in povray. The reducted testcase:
> > > Just for clarification. PR117081 is not about regression in povray.
> > > it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> > > testl
> > > The pr91384.c is added by r12-7417 which is peephole optimization
> > > expecting some specific instruction sequence, the regression can be
> > > fixed by adding new peephole pattern.
> > >
> > > H.J patch actually regressed povray by introducing extra push/pops
> > > (since it adds preference for callee save registers, in the benchmark
> > > using caller saved registers is much better).
> > > Sorry, I may not have been clear in
> > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
> > >
> >
> > My patch doesn't change the codegen for that code as shown by
> Real benchmark scenarios are a little more complex, the testcase in
> the PR is just one of the scenes, but not all.
> We are currently investigating this case and hope to find a better solution.
We need testcases to make sure that there are no regressions.
> >
> > commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
> > Author: H.J. Lu <hjl.tools@gmail.com>
> > Date: Fri Feb 7 13:49:30 2025 +0800
> >
> > x86: Verify that PUSH/POP can be skipped
> >
> >
> > --
> > H.J.
>
>
>
> --
> BR,
> Hongtao
On Tue, Feb 11, 2025 at 4:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 11, 2025 at 4:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > > PR117081 is about regression in povray. The reducted testcase:
> > Just for clarification. PR117081 is not about regression in povray.
> > it's related to FAIL: gcc.target/i386/pr91384.c scan-assembler-not
> > testl
> > The pr91384.c is added by r12-7417 which is peephole optimization
> > expecting some specific instruction sequence, the regression can be
> > fixed by adding new peephole pattern.
> >
> > H.J patch actually regressed povray by introducing extra push/pops
> > (since it adds preference for callee save registers, in the benchmark
> > using caller saved registers is much better).
> > Sorry, I may not have been clear in
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081#c9
> >
>
> My patch doesn't change the codegen for that code as shown by
Real benchmark scenarios are a little more complex, the testcase in
the PR is just one of the scenes, but not all.
We are currently investigating this case and hope to find a better solution.
>
> commit 846837c2406ae7a52d9123b29c13e4b8b9d14224
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date: Fri Feb 7 13:49:30 2025 +0800
>
> x86: Verify that PUSH/POP can be skipped
>
>
> --
> H.J.
@@ -20600,6 +20600,14 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
return false;
}
+/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE. */
+
+static int
+ix86_ira_callee_saved_register_cost_scale (int)
+{
+ return 1;
+}
+
/* Return true if a set of DST by the expression SRC should be allowed.
This prevents complex sets of likely_spilled hard regs before split1. */
@@ -27078,6 +27086,9 @@ ix86_libgcc_floating_mode_supported_p
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
+#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
+#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
+ ix86_ira_callee_saved_register_cost_scale
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
@@ -3047,6 +3047,14 @@ A target hook which can change allocno class for given pseudo from
The default version of this target hook always returns given class.
@end deftypefn
+@deftypefn {Target Hook} int TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE (int @var{hard_regno})
+A target hook which returns the callee-saved register @var{hard_regno}
+cost scale in epilogue and prologue used by IRA.
+
+The default version of this target hook returns 1 if optimizing for
+size, otherwise returns the entry block frequency.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_LRA_P (void)
A target hook which returns true if we use LRA instead of reload pass.
@@ -2388,6 +2388,8 @@ in the reload pass.
@hook TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
+@hook TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
+
@hook TARGET_LRA_P
@hook TARGET_REGISTER_PRIORITY
@@ -2180,8 +2180,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
+ ira_memory_move_cost[mode][rclass][1])
* saved_nregs / hard_regno_nregs (hard_regno,
mode) - 1)
- * (optimize_size ? 1 :
- REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+ * targetm.ira_callee_saved_register_cost_scale (hard_regno);
cost += add_cost;
full_cost += add_cost;
}
@@ -5714,6 +5714,18 @@ DEFHOOK
reg_class_t, (int, reg_class_t, reg_class_t),
default_ira_change_pseudo_allocno_class)
+/* Scale of callee-saved register cost in epilogue and prologue used by
+ IRA. */
+DEFHOOK
+(ira_callee_saved_register_cost_scale,
+ "A target hook which returns the callee-saved register @var{hard_regno}\n\
+cost scale in epilogue and prologue used by IRA.\n\
+\n\
+The default version of this target hook returns 1 if optimizing for\n\
+size, otherwise returns the entry block frequency.",
+ int, (int hard_regno),
+ default_ira_callee_saved_register_cost_scale)
+
/* Return true if we use LRA instead of reload. */
DEFHOOK
(lra_p,
@@ -1305,6 +1305,14 @@ default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED,
return cl;
}
+int
+default_ira_callee_saved_register_cost_scale (int)
+{
+ return (optimize_size
+ ? 1
+ : REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+}
+
extern bool
default_lra_p (void)
{
@@ -173,6 +173,7 @@ extern void default_emit_call_builtin___clear_cache (rtx, rtx);
extern poly_int64 default_return_pops_args (tree, tree, poly_int64);
extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t,
reg_class_t);
+extern int default_ira_callee_saved_register_cost_scale (int);
extern bool default_lra_p (void);
extern int default_register_priority (int);
extern bool default_register_usage_leveling_p (void);