LoongArch: Enable shrink wrapping

Message ID 20230423131903.155998-1-xry111@xry111.site
State New
Headers
Series LoongArch: Enable shrink wrapping |

Commit Message

Xi Ruoyao April 23, 2023, 1:19 p.m. UTC
  This commit implements the target macros for shrink wrapping of function
prologues/epilogues on LoongArch.

Bootstrapped and regtested on loongarch64-linux-gnu.  I don't have
access to SPEC CPU, so I hope the reviewer can run a benchmark to see
if there is a real benefit.

gcc/ChangeLog:

	* config/loongarch/loongarch.h (struct machine_function): Add
	reg_is_wrapped_separately array for register wrapping
	information.
	* config/loongarch/loongarch.cc
	(loongarch_get_separate_components): New function.
	(loongarch_components_for_bb): Likewise.
	(loongarch_disqualify_components): Likewise.
	(loongarch_process_components): Likewise.
	(loongarch_emit_prologue_components): Likewise.
	(loongarch_emit_epilogue_components): Likewise.
	(loongarch_set_handled_components): Likewise.
	(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
	(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
	(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
	(loongarch_for_each_saved_reg): Skip registers that are wrapped
	separately.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/shrink-wrap.c: New test.
---
 gcc/config/loongarch/loongarch.cc             | 179 +++++++++++++++++-
 gcc/config/loongarch/loongarch.h              |   2 +
 .../gcc.target/loongarch/shrink-wrap.c        |  22 +++
 3 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
  

Comments

Lulu Cheng April 24, 2023, 8:51 a.m. UTC | #1
Ok, I will do spec performance test comparison as soon as possible.

Thanks!

在 2023/4/23 下午9:19, Xi Ruoyao 写道:
> This commit implements the target macros for shrink wrapping of function
> prologues/epilogues shrink wrapping on LoongArch.
>
> Bootstrapped and regtested on loongarch64-linux-gnu.  I don't have an
> access to SPEC CPU so I hope the reviewer can perform a benchmark to see
> if there is real benefit.
>
> gcc/ChangeLog:
>
> 	* config/loongarch/loongarch.h (struct machine_function): Add
> 	reg_is_wrapped_separately array for register wrapping
> 	information.
> 	* config/loongarch/loongarch.cc
> 	(loongarch_get_separate_components): New function.
> 	(loongarch_components_for_bb): Likewise.
> 	(loongarch_disqualify_components): Likewise.
> 	(loongarch_process_components): Likewise.
> 	(loongarch_emit_prologue_components): Likewise.
> 	(loongarch_emit_epilogue_components): Likewise.
> 	(loongarch_set_handled_components): Likewise.
> 	(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> 	(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> 	(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> 	(loongarch_for_each_saved_reg): Skip registers that are wrapped
> 	separately.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/loongarch/shrink-wrap.c: New test.
> ---
>   gcc/config/loongarch/loongarch.cc             | 179 +++++++++++++++++-
>   gcc/config/loongarch/loongarch.h              |   2 +
>   .../gcc.target/loongarch/shrink-wrap.c        |  22 +++
>   3 files changed, 200 insertions(+), 3 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index e523fcb6b7f..d0024237a6a 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3.  If not see
>   #include "builtins.h"
>   #include "rtl-iter.h"
>   #include "opts.h"
> +#include "function-abi.h"
>   
>   /* This file should be included last.  */
>   #include "target-def.h"
> @@ -1017,19 +1018,23 @@ loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset,
>     for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
>       if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
>         {
> -	loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +	if (!cfun->machine->reg_is_wrapped_separately[regno])
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +
>   	offset -= UNITS_PER_WORD;
>         }
>   
>     /* This loop must iterate over the same space as its companion in
>        loongarch_compute_frame_info.  */
>     offset = cfun->machine->frame.fp_sp_offset - sp_offset;
> +  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
>     for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
>       if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
>         {
> -	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +	if (!cfun->machine->reg_is_wrapped_separately[regno])
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
>   
> -	loongarch_save_restore_reg (mode, regno, offset, fn);
>   	offset -= GET_MODE_SIZE (mode);
>         }
>   }
> @@ -6644,6 +6649,151 @@ loongarch_asan_shadow_offset (void)
>     return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
>   }
>   
> +static sbitmap
> +loongarch_get_separate_components (void)
> +{
> +  HOST_WIDE_INT offset;
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +  offset = cfun->machine->frame.gp_sp_offset;
> +
> +  /* The stack should be aligned to 16-bytes boundary, so we can make the use
> +     of ldptr instructions.  */
> +  gcc_assert (offset % UNITS_PER_WORD == 0);
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	/* We can wrap general registers saved at [sp, sp + 32768) using the
> +	   ldptr/stptr instructions.  For large offsets a pseudo register
> +	   might be needed which cannot be created during the shrink
> +	   wrapping pass.
> +
> +	   TODO: This may need a revise when we add LA32 as ldptr.w is not
> +	   guaranteed available by the manual.  */
> +	if (offset < 32768)
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	/* We can only wrap FP registers with imm12 offsets.  For large
> +	   offsets a pseudo register might be needed which cannot be
> +	   created during the shrink wrapping pass.  */
> +	if (IMM12_OPERAND (offset))
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= UNITS_PER_FPREG;
> +      }
> +
> +  /* Don't mess with the hard frame pointer.  */
> +  if (frame_pointer_needed)
> +    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
> +
> +  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
> +
> +  return components;
> +}
> +
> +static sbitmap
> +loongarch_components_for_bb (basic_block bb)
> +{
> +  /* Registers are used in a bb if they are in the IN, GEN, or KILL sets.  */
> +  auto_bitmap used;
> +  bitmap_copy (used, DF_LIVE_IN (bb));
> +  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen);
> +  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill);
> +
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +
> +  function_abi_aggregator callee_abis;
> +  rtx_insn *insn;
> +  FOR_BB_INSNS (bb, insn)
> +    if (CALL_P (insn))
> +      callee_abis.note_callee_abi (insn_callee_abi (insn));
> +
> +  HARD_REG_SET extra_caller_saves =
> +    callee_abis.caller_save_regs (*crtl->abi);
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +	&& !crtl->abi->clobbers_full_reg_p (regno)
> +	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
> +	    bitmap_bit_p (used, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +	&& !crtl->abi->clobbers_full_reg_p (regno)
> +	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
> +	    bitmap_bit_p (used, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  return components;
> +}
> +
> +static void
> +loongarch_disqualify_components (sbitmap, edge, sbitmap, bool)
> +{
> +  /* Do nothing.  */
> +}
> +
> +static void
> +loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn)
> +{
> +  HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset;
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	if (bitmap_bit_p (components, regno))
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	if (bitmap_bit_p (components, regno))
> +	  loongarch_save_restore_reg (mode, regno, offset, fn);
> +
> +	offset -= UNITS_PER_FPREG;
> +      }
> +}
> +
> +static void
> +loongarch_emit_prologue_components (sbitmap components)
> +{
> +  loongarch_process_components (components, loongarch_save_reg);
> +}
> +
> +static void
> +loongarch_emit_epilogue_components (sbitmap components)
> +{
> +  loongarch_process_components (components, loongarch_restore_reg);
> +}
> +
> +static void
> +loongarch_set_handled_components (sbitmap components)
> +{
> +    for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +      if (bitmap_bit_p (components, regno))
> +	cfun->machine->reg_is_wrapped_separately[regno] = true;
> +
> +    for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +      if (bitmap_bit_p (components, regno))
> +	cfun->machine->reg_is_wrapped_separately[regno] = true;
> +}
> +
>   /* Initialize the GCC target structure.  */
>   #undef TARGET_ASM_ALIGNED_HI_OP
>   #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -6841,6 +6991,29 @@ loongarch_asan_shadow_offset (void)
>   #undef TARGET_ASAN_SHADOW_OFFSET
>   #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset
>   
> +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
> +  loongarch_get_separate_components
> +
> +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
> +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb
> +
> +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
> +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
> +  loongarch_disqualify_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
> +  loongarch_emit_prologue_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
> +  loongarch_emit_epilogue_components
> +
> +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
> +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
> +  loongarch_set_handled_components
> +
>   struct gcc_target targetm = TARGET_INITIALIZER;
>   
>   #include "gt-loongarch.h"
> diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
> index a9eff6a81bd..829acdaa9be 100644
> --- a/gcc/config/loongarch/loongarch.h
> +++ b/gcc/config/loongarch/loongarch.h
> @@ -1147,6 +1147,8 @@ struct GTY (()) machine_function
>     /* The current frame information, calculated by loongarch_compute_frame_info.
>      */
>     struct loongarch_frame_info frame;
> +
> +  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
>   };
>   #endif
>   
> diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> new file mode 100644
> index 00000000000..f2c867a2769
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fshrink-wrap" } */
> +
> +/* f(x) should do nothing if x is 0.  */
> +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
> +
> +void g(void);
> +
> +void
> +f(int x)
> +{
> +  if (x)
> +    {
> +      register int s0 asm("s0") = x;
> +      register int s1 asm("s1") = x;
> +      register int s2 asm("s2") = x;
> +      asm("" : : "r"(s0));
> +      asm("" : : "r"(s1));
> +      asm("" : : "r"(s2));
> +      g();
> +    }
> +}
  
Lulu Cheng April 25, 2023, 7:37 a.m. UTC | #2
+guojie

在 2023/4/23 下午9:19, Xi Ruoyao 写道:
> This commit implements the target macros for shrink wrapping of function
> prologues/epilogues shrink wrapping on LoongArch.
>
> Bootstrapped and regtested on loongarch64-linux-gnu.  I don't have an
> access to SPEC CPU so I hope the reviewer can perform a benchmark to see
> if there is real benefit.
>
> gcc/ChangeLog:
>
> 	* config/loongarch/loongarch.h (struct machine_function): Add
> 	reg_is_wrapped_separately array for register wrapping
> 	information.
> 	* config/loongarch/loongarch.cc
> 	(loongarch_get_separate_components): New function.
> 	(loongarch_components_for_bb): Likewise.
> 	(loongarch_disqualify_components): Likewise.
> 	(loongarch_process_components): Likewise.
> 	(loongarch_emit_prologue_components): Likewise.
> 	(loongarch_emit_epilogue_components): Likewise.
> 	(loongarch_set_handled_components): Likewise.
> 	(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> 	(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> 	(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> 	(loongarch_for_each_saved_reg): Skip registers that are wrapped
> 	separately.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/loongarch/shrink-wrap.c: New test.
> ---
>   gcc/config/loongarch/loongarch.cc             | 179 +++++++++++++++++-
>   gcc/config/loongarch/loongarch.h              |   2 +
>   .../gcc.target/loongarch/shrink-wrap.c        |  22 +++
>   3 files changed, 200 insertions(+), 3 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index e523fcb6b7f..d0024237a6a 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3.  If not see
>   #include "builtins.h"
>   #include "rtl-iter.h"
>   #include "opts.h"
> +#include "function-abi.h"
>   
>   /* This file should be included last.  */
>   #include "target-def.h"
> @@ -1017,19 +1018,23 @@ loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset,
>     for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
>       if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
>         {
> -	loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +	if (!cfun->machine->reg_is_wrapped_separately[regno])
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +
>   	offset -= UNITS_PER_WORD;
>         }
>   
>     /* This loop must iterate over the same space as its companion in
>        loongarch_compute_frame_info.  */
>     offset = cfun->machine->frame.fp_sp_offset - sp_offset;
> +  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
>     for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
>       if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
>         {
> -	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +	if (!cfun->machine->reg_is_wrapped_separately[regno])
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
>   
> -	loongarch_save_restore_reg (mode, regno, offset, fn);
>   	offset -= GET_MODE_SIZE (mode);
>         }
>   }
> @@ -6644,6 +6649,151 @@ loongarch_asan_shadow_offset (void)
>     return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
>   }
>   
> +static sbitmap
> +loongarch_get_separate_components (void)
> +{
> +  HOST_WIDE_INT offset;
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +  offset = cfun->machine->frame.gp_sp_offset;
> +
> +  /* The stack should be aligned to 16-bytes boundary, so we can make the use
> +     of ldptr instructions.  */
> +  gcc_assert (offset % UNITS_PER_WORD == 0);
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	/* We can wrap general registers saved at [sp, sp + 32768) using the
> +	   ldptr/stptr instructions.  For large offsets a pseudo register
> +	   might be needed which cannot be created during the shrink
> +	   wrapping pass.
> +
> +	   TODO: This may need a revise when we add LA32 as ldptr.w is not
> +	   guaranteed available by the manual.  */
> +	if (offset < 32768)
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	/* We can only wrap FP registers with imm12 offsets.  For large
> +	   offsets a pseudo register might be needed which cannot be
> +	   created during the shrink wrapping pass.  */
> +	if (IMM12_OPERAND (offset))
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= UNITS_PER_FPREG;
> +      }
> +
> +  /* Don't mess with the hard frame pointer.  */
> +  if (frame_pointer_needed)
> +    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
> +
> +  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
> +
> +  return components;
> +}
> +
> +static sbitmap
> +loongarch_components_for_bb (basic_block bb)
> +{
> +  /* Registers are used in a bb if they are in the IN, GEN, or KILL sets.  */
> +  auto_bitmap used;
> +  bitmap_copy (used, DF_LIVE_IN (bb));
> +  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen);
> +  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill);
> +
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +
> +  function_abi_aggregator callee_abis;
> +  rtx_insn *insn;
> +  FOR_BB_INSNS (bb, insn)
> +    if (CALL_P (insn))
> +      callee_abis.note_callee_abi (insn_callee_abi (insn));
> +
> +  HARD_REG_SET extra_caller_saves =
> +    callee_abis.caller_save_regs (*crtl->abi);
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +	&& !crtl->abi->clobbers_full_reg_p (regno)
> +	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
> +	    bitmap_bit_p (used, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +	&& !crtl->abi->clobbers_full_reg_p (regno)
> +	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
> +	    bitmap_bit_p (used, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  return components;
> +}
> +
> +static void
> +loongarch_disqualify_components (sbitmap, edge, sbitmap, bool)
> +{
> +  /* Do nothing.  */
> +}
> +
> +static void
> +loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn)
> +{
> +  HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset;
> +
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	if (bitmap_bit_p (components, regno))
> +	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	if (bitmap_bit_p (components, regno))
> +	  loongarch_save_restore_reg (mode, regno, offset, fn);
> +
> +	offset -= UNITS_PER_FPREG;
> +      }
> +}
> +
> +static void
> +loongarch_emit_prologue_components (sbitmap components)
> +{
> +  loongarch_process_components (components, loongarch_save_reg);
> +}
> +
> +static void
> +loongarch_emit_epilogue_components (sbitmap components)
> +{
> +  loongarch_process_components (components, loongarch_restore_reg);
> +}
> +
> +static void
> +loongarch_set_handled_components (sbitmap components)
> +{
> +    for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +      if (bitmap_bit_p (components, regno))
> +	cfun->machine->reg_is_wrapped_separately[regno] = true;
> +
> +    for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +      if (bitmap_bit_p (components, regno))
> +	cfun->machine->reg_is_wrapped_separately[regno] = true;
> +}
> +
>   /* Initialize the GCC target structure.  */
>   #undef TARGET_ASM_ALIGNED_HI_OP
>   #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -6841,6 +6991,29 @@ loongarch_asan_shadow_offset (void)
>   #undef TARGET_ASAN_SHADOW_OFFSET
>   #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset
>   
> +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
> +  loongarch_get_separate_components
> +
> +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
> +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb
> +
> +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
> +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
> +  loongarch_disqualify_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
> +  loongarch_emit_prologue_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
> +  loongarch_emit_epilogue_components
> +
> +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
> +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
> +  loongarch_set_handled_components
> +
>   struct gcc_target targetm = TARGET_INITIALIZER;
>   
>   #include "gt-loongarch.h"
> diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
> index a9eff6a81bd..829acdaa9be 100644
> --- a/gcc/config/loongarch/loongarch.h
> +++ b/gcc/config/loongarch/loongarch.h
> @@ -1147,6 +1147,8 @@ struct GTY (()) machine_function
>     /* The current frame information, calculated by loongarch_compute_frame_info.
>      */
>     struct loongarch_frame_info frame;
> +
> +  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
>   };
>   #endif
>   
> diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> new file mode 100644
> index 00000000000..f2c867a2769
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fshrink-wrap" } */
> +
> +/* f(x) should do nothing if x is 0.  */
> +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
> +
> +void g(void);
> +
> +void
> +f(int x)
> +{
> +  if (x)
> +    {
> +      register int s0 asm("s0") = x;
> +      register int s1 asm("s1") = x;
> +      register int s2 asm("s2") = x;
> +      asm("" : : "r"(s0));
> +      asm("" : : "r"(s1));
> +      asm("" : : "r"(s2));
> +      g();
> +    }
> +}
  
Guo Jie April 25, 2023, 9:12 a.m. UTC | #3
/* snip */

>>   diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c 
>> b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>> new file mode 100644
>> index 00000000000..f2c867a2769
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>> @@ -0,0 +1,22 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O -fshrink-wrap" } */
>> +
>> +/* f(x) should do nothing if x is 0.  */
>> +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" 
>> } } */
>> +
>> +void g(void);
>> +
>> +void
>> +f(int x)
>> +{
>> +  if (x)
>> +    {
>> +      register int s0 asm("s0") = x;
>> +      register int s1 asm("s1") = x;
>> +      register int s2 asm("s2") = x;
>> +      asm("" : : "r"(s0));
>> +      asm("" : : "r"(s1));
>> +      asm("" : : "r"(s2));
>> +      g();
>> +    }
>> +}

I think the test case cannot fully reflect the optimization effect of 
the current patch,

because even without the patch, -O -fshrink-wrap will still perform the 
architecture-independent optimization.

This patch treats the individual saved registers as separate components, 
which enables finer-grained shrink wrapping,

so I think a test case like the one below is more suitable:


int foo(int x)
{
   if (x)
   {
     __asm__ ("":::"s0","s1");
     return x;
   }

   __asm__ ("":::"s2","s3");
   return 0;
}

Otherwise LGTM, thanks!
  
Lulu Cheng April 26, 2023, 9:53 a.m. UTC | #4
Hi, ruoyao:

       The SPEC CPU 2006 performance test is finished. The fixed-point 
400.perlbench shows about a 3% performance improvement,

the others are basically unchanged, and the floating-point tests have 
basically remained the same.

       Do you have any questions about the test case suggested by Guo 
Jie? If there is no problem, please modify the test case;

then I think the code can be merged into the main branch.


Thanks!

在 2023/4/25 下午5:12, Guo Jie 写道:
> /* snip */
>
>>>   diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c 
>>> b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>>> new file mode 100644
>>> index 00000000000..f2c867a2769
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
>>> @@ -0,0 +1,22 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O -fshrink-wrap" } */
>>> +
>>> +/* f(x) should do nothing if x is 0.  */
>>> +/* { dg-final { scan-assembler 
>>> "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
>>> +
>>> +void g(void);
>>> +
>>> +void
>>> +f(int x)
>>> +{
>>> +  if (x)
>>> +    {
>>> +      register int s0 asm("s0") = x;
>>> +      register int s1 asm("s1") = x;
>>> +      register int s2 asm("s2") = x;
>>> +      asm("" : : "r"(s0));
>>> +      asm("" : : "r"(s1));
>>> +      asm("" : : "r"(s2));
>>> +      g();
>>> +    }
>>> +}
>
> I think the test case cannot fully reflect the optimization effect of 
> the current patch,
>
> because even without the patch, -O -fshrink-wrap will still perform 
> architecture independent optimization.
>
> This patch considers architecture related registers as finer grained 
> optimization for shrink wrapping,
>
> I think a test case like the one below is more suitable:
>
>
> int foo(int x)
> {
>   if (x)
>   {
>     __asm__ ("":::"s0","s1");
>     return x;
>   }
>
>   __asm__ ("":::"s2","s3");
>   return 0;
> }
>
> Otherwise LGTM, thanks!
  
WANG Xuerui April 26, 2023, 10:02 a.m. UTC | #5
On 2023/4/26 17:53, Lulu Cheng wrote:
> Hi, ruoyao:
>
>       The performance of spec2006 is finished. The fixed-point 
> 400.perlbench has about 3% performance improvement,
>
> and the other basics have not changed, and the floating-point tests 
> have basically remained the same.
Nice to know!
>
>       Do you have any questions about the test cases mentioned by Guo 
> Jie? If there is no problem, modify the test case,
>
> I think the code can be merged into the main branch.
>
> <snip>
BTW what about the previous function/loop alignment patches? The LLVM 
changes are also waiting for such results. ;-)
  
Lulu Cheng April 26, 2023, 10:14 a.m. UTC | #6
在 2023/4/26 下午6:02, WANG Xuerui 写道:
>
> On 2023/4/26 17:53, Lulu Cheng wrote:
>> Hi, ruoyao:
>>
>>       The performance of spec2006 is finished. The fixed-point 
>> 400.perlbench has about 3% performance improvement,
>>
>> and the other basics have not changed, and the floating-point tests 
>> have basically remained the same.
> Nice to know!
>>
>>       Do you have any questions about the test cases mentioned by Guo 
>> Jie? If there is no problem, modify the test case,
>>
>> I think the code can be merged into the main branch.
>>
>> <snip>
> BTW what about the previous function/loop alignment patches? The LLVM 
> changes are also waiting for such results. ;-)
Well, there are many combinations in this align test, so the test time 
will be very long. I will reply with the results as soon as they 
come out. :-)
  
WANG Xuerui April 26, 2023, 10:21 a.m. UTC | #7
On 2023/4/26 18:14, Lulu Cheng wrote:
>
> 在 2023/4/26 下午6:02, WANG Xuerui 写道:
>>
>> On 2023/4/26 17:53, Lulu Cheng wrote:
>>> Hi, ruoyao:
>>>
>>>       The performance of spec2006 is finished. The fixed-point 
>>> 400.perlbench has about 3% performance improvement,
>>>
>>> and the other basics have not changed, and the floating-point tests 
>>> have basically remained the same.
>> Nice to know!
>>>
>>>       Do you have any questions about the test cases mentioned by 
>>> Guo Jie? If there is no problem, modify the test case,
>>>
>>> I think the code can be merged into the main branch.
>>>
>>> <snip>
>> BTW what about the previous function/loop alignment patches? The LLVM 
>> changes are also waiting for such results. ;-)
> Well, there are many combinations in this align test, so the test time 
> will be very long. I will reply the result as soon as the test results 
> come out.:-)
>
Oh, I got it. Thanks very much for all the tests, and take your time!
  
Xi Ruoyao April 26, 2023, 1:29 p.m. UTC | #8
On Wed, 2023-04-26 at 17:53 +0800, Lulu Cheng wrote:
> Hi, ruoyao:
> 
>        The performance of spec2006 is finished. The fixed-point 
> 400.perlbench has about 3% performance improvement,
> 
> and the other basics have not changed, and the floating-point tests
> have 
> basically remained the same.
> 
>        Do you have any questions about the test cases mentioned by Guo
> Jie? If there is no problem, modify the test case,
> 
> I think the code can be merged into the main branch.

I'll rewrite the test and commit in a few days (now I'm occupied with
something :( ).

> 
> Thanks!
> 
> 在 2023/4/25 下午5:12, Guo Jie 写道:
> > /* snip */
> > 
> > > >   diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c 
> > > > b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> > > > new file mode 100644
> > > > index 00000000000..f2c867a2769
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
> > > > @@ -0,0 +1,22 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O -fshrink-wrap" } */
> > > > +
> > > > +/* f(x) should do nothing if x is 0.  */
> > > > +/* { dg-final { scan-assembler 
> > > > "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
> > > > +
> > > > +void g(void);
> > > > +
> > > > +void
> > > > +f(int x)
> > > > +{
> > > > +  if (x)
> > > > +    {
> > > > +      register int s0 asm("s0") = x;
> > > > +      register int s1 asm("s1") = x;
> > > > +      register int s2 asm("s2") = x;
> > > > +      asm("" : : "r"(s0));
> > > > +      asm("" : : "r"(s1));
> > > > +      asm("" : : "r"(s2));
> > > > +      g();
> > > > +    }
> > > > +}
> > 
> > I think the test case cannot fully reflect the optimization effect
> > of 
> > the current patch,
> > 
> > because even without the patch, -O -fshrink-wrap will still perform 
> > architecture independent optimization.
> > 
> > This patch considers architecture related registers as finer grained
> > optimization for shrink wrapping,
> > 
> > I think a test case like the one below is more suitable:
> > 
> > 
> > int foo(int x)
> > {
> >   if (x)
> >   {
> >     __asm__ ("":::"s0","s1");
> >     return x;
> >   }
> > 
> >   __asm__ ("":::"s2","s3");
> >   return 0;
> > }
> > 
> > Otherwise LGTM, thanks!
>
  
Xi Ruoyao May 6, 2023, 5:07 p.m. UTC | #9
On Wed, 2023-04-26 at 18:21 +0800, WANG Xuerui wrote:
> On 2023/4/26 18:14, Lulu Cheng wrote:
> > 
> > 在 2023/4/26 下午6:02, WANG Xuerui 写道:
> > > 
> > > On 2023/4/26 17:53, Lulu Cheng wrote:
> > > > Hi, ruoyao:
> > > > 
> > > >       The performance of spec2006 is finished. The fixed-point 
> > > > 400.perlbench has about 3% performance improvement,
> > > > 
> > > > and the other basics have not changed, and the floating-point tests 
> > > > have basically remained the same.
> > > Nice to know!
> > > > 
> > > >       Do you have any questions about the test cases mentioned by 
> > > > Guo Jie? If there is no problem, modify the test case,
> > > > 
> > > > I think the code can be merged into the main branch.
> > > > 
> > > > <snip>
> > > BTW what about the previous function/loop alignment patches? The LLVM 
> > > changes are also waiting for such results. ;-)
> > Well, there are many combinations in this align test, so the test time 
> > will be very long. I will reply the result as soon as the test results 
> > come out.:-)
> > 
> Oh, I got. Thanks very much for all the tests and take your time!

Sorry if it's noisy, but I hope there is some (maybe preliminary)
result: now I finally have some spare time to rebuild the system with
GCC 13 and I'd like to use some -falign-functions= in my CFLAGS :).
  
Lulu Cheng May 6, 2023, 11:34 p.m. UTC | #10
在 2023/5/7 上午1:07, Xi Ruoyao 写道:
> On Wed, 2023-04-26 at 18:21 +0800, WANG Xuerui wrote:
>> On 2023/4/26 18:14, Lulu Cheng wrote:
>>> 在 2023/4/26 下午6:02, WANG Xuerui 写道:
>>>> On 2023/4/26 17:53, Lulu Cheng wrote:
>>>>> Hi, ruoyao:
>>>>>
>>>>>        The performance of spec2006 is finished. The fixed-point
>>>>> 400.perlbench has about 3% performance improvement,
>>>>>
>>>>> and the other basics have not changed, and the floating-point tests
>>>>> have basically remained the same.
>>>> Nice to know!
>>>>>        Do you have any questions about the test cases mentioned by
>>>>> Guo Jie? If there is no problem, modify the test case,
>>>>>
>>>>> I think the code can be merged into the main branch.
>>>>>
>>>>> <snip>
>>>> BTW what about the previous function/loop alignment patches? The LLVM
>>>> changes are also waiting for such results. ;-)
>>> Well, there are many combinations in this align test, so the test time
>>> will be very long. I will reply the result as soon as the test results
>>> come out.:-)
>>>
>> Oh, I got. Thanks very much for all the tests and take your time!
> Sorry if it's noisy, but I hope there is some (maybe preliminary)
> result: now I finally have some spare time to rebuild the system with
> GCC 13 and I'd like to use some -falign-functions= in my CFLAGS :).
>
The test is still ongoing, and I will reply with the results by email 
after the test is completed. :-)
  

Patch

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index e523fcb6b7f..d0024237a6a 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -64,6 +64,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "rtl-iter.h"
 #include "opts.h"
+#include "function-abi.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -1017,19 +1018,23 @@  loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset,
   for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
       {
-	loongarch_save_restore_reg (word_mode, regno, offset, fn);
+	if (!cfun->machine->reg_is_wrapped_separately[regno])
+	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
+
 	offset -= UNITS_PER_WORD;
       }
 
   /* This loop must iterate over the same space as its companion in
      loongarch_compute_frame_info.  */
   offset = cfun->machine->frame.fp_sp_offset - sp_offset;
+  /* Each FP save slot is accessed in MODE, never in word_mode.  */
+  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
   for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
       {
-	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+	if (!cfun->machine->reg_is_wrapped_separately[regno])
+	  loongarch_save_restore_reg (mode, regno, offset, fn);
 
-	loongarch_save_restore_reg (mode, regno, offset, fn);
 	offset -= GET_MODE_SIZE (mode);
       }
 }
@@ -6644,6 +6649,151 @@  loongarch_asan_shadow_offset (void)
   return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
 }
 
+static sbitmap
+loongarch_get_separate_components (void)
+{
+  HOST_WIDE_INT offset;
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+  offset = cfun->machine->frame.gp_sp_offset;
+
+  /* The stack should be aligned to 16-bytes boundary, so we can make the use
+     of ldptr instructions.  */
+  gcc_assert (offset % UNITS_PER_WORD == 0);
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	/* We can wrap general registers saved at [sp, sp + 32768) using the
+	   ldptr/stptr instructions.  For large offsets a pseudo register
+	   might be needed which cannot be created during the shrink
+	   wrapping pass.
+
+	   TODO: This may need a revise when we add LA32 as ldptr.w is not
+	   guaranteed available by the manual.  */
+	if (offset < 32768)
+	  bitmap_set_bit (components, regno);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	/* We can only wrap FP registers with imm12 offsets.  For large
+	   offsets a pseudo register might be needed which cannot be
+	   created during the shrink wrapping pass.  */
+	if (IMM12_OPERAND (offset))
+	  bitmap_set_bit (components, regno);
+
+	offset -= UNITS_PER_FPREG;
+      }
+
+  /* Don't mess with the hard frame pointer.  */
+  if (frame_pointer_needed)
+    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
+
+  return components;
+}
+
+static sbitmap
+loongarch_components_for_bb (basic_block bb)
+{
+  /* Registers are used in a bb if they are in the IN, GEN, or KILL sets.  */
+  auto_bitmap used;
+  bitmap_copy (used, DF_LIVE_IN (bb));
+  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen);
+  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill);
+
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+
+  function_abi_aggregator callee_abis;
+  rtx_insn *insn;
+  FOR_BB_INSNS (bb, insn)
+    if (CALL_P (insn))
+      callee_abis.note_callee_abi (insn_callee_abi (insn));
+
+  HARD_REG_SET extra_caller_saves =
+    callee_abis.caller_save_regs (*crtl->abi);
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno)
+	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
+	    bitmap_bit_p (used, regno)))
+      bitmap_set_bit (components, regno);
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno)
+	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno) ||
+	    bitmap_bit_p (used, regno)))
+      bitmap_set_bit (components, regno);
+
+  return components;
+}
+
+static void
+loongarch_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+  /* Do nothing.  */
+}
+
+static void
+loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn)
+{
+  HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset;
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	if (bitmap_bit_p (components, regno))
+	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	if (bitmap_bit_p (components, regno))
+	  loongarch_save_restore_reg (mode, regno, offset, fn);
+
+	offset -= UNITS_PER_FPREG;
+      }
+}
+
+static void
+loongarch_emit_prologue_components (sbitmap components)
+{
+  loongarch_process_components (components, loongarch_save_reg);
+}
+
+static void
+loongarch_emit_epilogue_components (sbitmap components)
+{
+  loongarch_process_components (components, loongarch_restore_reg);
+}
+
+static void
+loongarch_set_handled_components (sbitmap components)
+{
+    for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+      if (bitmap_bit_p (components, regno))
+	cfun->machine->reg_is_wrapped_separately[regno] = true;
+
+    for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+      if (bitmap_bit_p (components, regno))
+	cfun->machine->reg_is_wrapped_separately[regno] = true;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -6841,6 +6991,29 @@  loongarch_asan_shadow_offset (void)
 #undef TARGET_ASAN_SHADOW_OFFSET
 #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset
 
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
+  loongarch_get_separate_components
+
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb
+
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
+  loongarch_disqualify_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+  loongarch_emit_prologue_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+  loongarch_emit_epilogue_components
+
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
+  loongarch_set_handled_components
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-loongarch.h"
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index a9eff6a81bd..829acdaa9be 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1147,6 +1147,8 @@  struct GTY (()) machine_function
   /* The current frame information, calculated by loongarch_compute_frame_info.
    */
   struct loongarch_frame_info frame;
+
+  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
 };
 #endif
 
diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
new file mode 100644
index 00000000000..f2c867a2769
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fshrink-wrap" } */
+
+/* f(x) should do nothing if x is 0.  */
+/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
+
+void g(void);
+
+void
+f(int x)
+{
+  if (x)
+    {
+      register int s0 asm("s0") = x;
+      register int s1 asm("s1") = x;
+      register int s2 asm("s2") = x;
+      asm("" : : "r"(s0));
+      asm("" : : "r"(s1));
+      asm("" : : "r"(s2));
+      g();
+    }
+}