[2/3] RISC-V: optimize stack manipulation in save-restore

Message ID 20221201100332.22226-3-gaofei@eswincomputing.com
State Deferred, archived
Headers
Series RISC-V: optimize stack manipulation in save-restore |

Commit Message

Fei Gao Dec. 1, 2022, 10:03 a.m. UTC
  The stack space that save-restore reserves is not well accounted for in stack allocation and deallocation.
This patch allows fewer instructions to be used in stack allocation and deallocation if save-restore is enabled.

before patch:
  bar:
    call	t0,__riscv_save_4
    addi	sp,sp,-64
    ...
    li	t0,-12288
    addi	t0,t0,-1968 # optimized out after patch
    add	sp,sp,t0 # prologue
    ...
    li	t0,12288 # epilogue
    addi	t0,t0,2000 # optimized out after patch
    add	sp,sp,t0
    ...
    addi	sp,sp,32
    tail	__riscv_restore_4

after patch:
  bar:
    call	t0,__riscv_save_4
    addi	sp,sp,-2032
    ...
    li	t0,-12288
    add	sp,sp,t0 # prologue
    ...
    li	t0,12288 # epilogue
    add	sp,sp,t0
    ...
    addi	sp,sp,2032
    tail	__riscv_restore_4

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_expand_prologue): Consider save-restore in stack allocation.
        (riscv_expand_epilogue): Consider save-restore in stack deallocation.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/stack_save_restore.c: New test.
---
 gcc/config/riscv/riscv.cc                     | 50 ++++++++++---------
 .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++++
 2 files changed, 66 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c
  

Comments

Fei Gao Feb. 3, 2023, 8:52 a.m. UTC | #1
Gentle ping.

The patch I previously submitted:
| Date: Wed, 30 Nov 2022 00:38:08 -0800
| Subject: [PATCH] RISC-V: optimize stack manipulation in save-restore
| Message-ID: <gaofei@eswincomputing.com>

I split the patches as per Palmer's review comment.

BR
Fei

>The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
>This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled.
>
>before patch:
>  bar:
>    call	t0,__riscv_save_4
>    addi	sp,sp,-64
>    ...
>    li	t0,-12288
>    addi	t0,t0,-1968 # optimized out after patch
>    add	sp,sp,t0 # prologue
>    ...
>    li	t0,12288 # epilogue
>    addi	t0,t0,2000 # optimized out after patch
>    add	sp,sp,t0
>    ...
>    addi	sp,sp,32
>    tail	__riscv_restore_4
>
>after patch:
>  bar:
>    call	t0,__riscv_save_4
>    addi	sp,sp,-2032
>    ...
>    li	t0,-12288
>    add	sp,sp,t0 # prologue
>    ...
>    li	t0,12288 # epilogue
>    add	sp,sp,t0
>    ...
>    addi	sp,sp,2032
>    tail	__riscv_restore_4
>
>gcc/ChangeLog:
>
>        * config/riscv/riscv.cc (riscv_expand_prologue): consider save-restore in stack allocation.
>        (riscv_expand_epilogue): consider save-restore in stack deallocation.
>
>gcc/testsuite/ChangeLog:
>
>        * gcc.target/riscv/stack_save_restore.c: New test.
>---
> gcc/config/riscv/riscv.cc                     | 50 ++++++++++---------
> .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++++
> 2 files changed, 66 insertions(+), 24 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>
>diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>index f0bbcd6d6be..a50f2303032 100644
>--- a/gcc/config/riscv/riscv.cc
>+++ b/gcc/config/riscv/riscv.cc
>@@ -5010,12 +5010,12 @@ void
> riscv_expand_prologue (void)
> {
>   struct riscv_frame_info *frame = &cfun->machine->frame;
>-  poly_int64 size = frame->total_size;
>+  poly_int64 remaining_size = frame->total_size;
>   unsigned mask = frame->mask;
>   rtx insn;
>
>   if (flag_stack_usage_info)
>-    current_function_static_stack_size = constant_lower_bound (size);
>+    current_function_static_stack_size = constant_lower_bound (remaining_size);
>
>   if (cfun->machine->naked_p)
>     return;
>@@ -5026,7 +5026,7 @@ riscv_expand_prologue (void)
>       rtx dwarf = NULL_RTX;
>       dwarf = riscv_adjust_libcall_cfi_prologue ();
>
>-      size -= frame->save_libcall_adjustment;
>+      remaining_size -= frame->save_libcall_adjustment;
>       insn = emit_insn (riscv_gen_gpr_save_insn (frame));
>       frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
>
>@@ -5037,16 +5037,14 @@ riscv_expand_prologue (void)
>   /* Save the registers.  */
>   if ((frame->mask | frame->fmask) != 0)
>     {
>-      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
>-      if (size.is_constant ())
>-	step1 = MIN (size.to_constant(), step1);
>+      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, remaining_size);
>
>       insn = gen_add3_insn (stack_pointer_rtx,
>     stack_pointer_rtx,
>     GEN_INT (-step1));
>       RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
>-      size -= step1;
>-      riscv_for_each_saved_reg (size, riscv_save_reg, false, false);
>+      remaining_size -= step1;
>+      riscv_for_each_saved_reg (remaining_size, riscv_save_reg, false, false);
>     }
>
>   frame->mask = mask; /* Undo the above fib.  */
>@@ -5055,29 +5053,29 @@ riscv_expand_prologue (void)
>   if (frame_pointer_needed)
>     {
>       insn = gen_add3_insn (hard_frame_pointer_rtx, stack_pointer_rtx,
>-	    GEN_INT ((frame->hard_frame_pointer_offset - size).to_constant ()));
>+	    GEN_INT ((frame->hard_frame_pointer_offset - remaining_size).to_constant ()));
>       RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
>
>       riscv_emit_stack_tie ();
>     }
>
>   /* Allocate the rest of the frame.  */
>-  if (known_gt (size, 0))
>+  if (known_gt (remaining_size, 0))
>     {
>       /* Two step adjustment:
> 1.scalable frame. 2.constant frame.  */
>       poly_int64 scalable_frame (0, 0);
>-      if (!size.is_constant ())
>+      if (!remaining_size.is_constant ())
> {
>   /* First for scalable frame.  */
>-	  poly_int64 scalable_frame = size;
>-	  scalable_frame.coeffs[0] = size.coeffs[1];
>+	  poly_int64 scalable_frame = remaining_size;
>+	  scalable_frame.coeffs[0] = remaining_size.coeffs[1];
>   riscv_v_adjust_scalable_frame (stack_pointer_rtx, scalable_frame, false);
>-	  size -= scalable_frame;
>+	  remaining_size -= scalable_frame;
> }
>
>       /* Second step for constant frame.  */
>-      HOST_WIDE_INT constant_frame = size.to_constant ();
>+      HOST_WIDE_INT constant_frame = remaining_size.to_constant ();
>       if (constant_frame == 0)
> return;
>
>@@ -5142,6 +5140,8 @@ riscv_expand_epilogue (int style)
>   HOST_WIDE_INT step2 = 0;
>   bool use_restore_libcall = ((style == NORMAL_RETURN)
>       && riscv_use_save_libcall (frame));
>+  unsigned libcall_size = use_restore_libcall ?
>+                            frame->save_libcall_adjustment : 0;
>   rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
>   rtx insn;
>
>@@ -5212,13 +5212,18 @@ riscv_expand_epilogue (int style)
>       REG_NOTES (insn) = dwarf;
>     }
>
>+  if (use_restore_libcall)
>+    frame->mask = 0; /* Temporarily fib for GPRs.  */
>+
>   /* If we need to restore registers, deallocate as much stack as
>      possible in the second step without going out of range.  */
>   if ((frame->mask | frame->fmask) != 0)
>-    {
>-      step2 = riscv_first_stack_step (frame, frame->total_size);
>-      step1 -= step2;
>-    }
>+    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
>+
>+  if (use_restore_libcall)
>+    frame->mask = mask; /* Undo the above fib.  */
>+
>+  step1 -= step2 + libcall_size;
>
>   /* Set TARGET to BASE + STEP1.  */
>   if (known_gt (step1, 0))
>@@ -5272,15 +5277,12 @@ riscv_expand_epilogue (int style)
>     frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
>
>   /* Restore the registers.  */
>-  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
>+  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
>+                            riscv_restore_reg,
>     true, style == EXCEPTION_RETURN);
>
>   if (use_restore_libcall)
>-    {
>       frame->mask = mask; /* Undo the above fib.  */
>-      gcc_assert (step2 >= frame->save_libcall_adjustment);
>-      step2 -= frame->save_libcall_adjustment;
>-    }
>
>   if (need_barrier_p)
>     riscv_emit_stack_tie ();
>diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>new file mode 100644
>index 00000000000..522e706cfbf
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>@@ -0,0 +1,40 @@
>+/* { dg-do compile } */
>+/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops -fno-lto" } */
>+/* { dg-final { check-function-bodies "**" "" } } */
>+
>+char my_getchar();
>+float getf();
>+
>+/*
>+**bar:
>+**	call	t0,__riscv_save_4
>+**	addi	sp,sp,-2032
>+**	...
>+**	li	t0,-12288
>+**	add	sp,sp,t0
>+**	...
>+**	li	t0,12288
>+**	add	sp,sp,t0
>+**	...
>+**	addi	sp,sp,2032
>+**	tail	__riscv_restore_4
>+*/
>+int bar()
>+{
>+  float volatile farray[3568];
>+
>+  float sum = 0;
>+  float f1 = getf();
>+  float f2 = getf();
>+  float f3 = getf();
>+  float f4 = getf();
>+
>+  for (int i = 0; i < 3568; i++)
>+  {
>+    farray[i] = my_getchar() * 1.2;
>+    sum += farray[i];
>+  }
>+
>+  return sum + f1 + f2 + f3 + f4;
>+}
>+
>--
>2.17.1
  
Jeff Law April 16, 2023, 4:45 p.m. UTC | #2
On 12/1/22 03:03, Fei Gao wrote:
> The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
> This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled.
Haha!  I should have read the whole series before commenting on the 
first patch.  I think this addresses the precise issue I was asking 
about in my prior message.

Jeff
  
Jeff Law April 17, 2023, 10:51 p.m. UTC | #3
On 12/1/22 03:03, Fei Gao wrote:
> The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
> This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled.
> 
> before patch:
>    bar:
>      call	t0,__riscv_save_4
>      addi	sp,sp,-64
>      ...
>      li	t0,-12288
>      addi	t0,t0,-1968 # optimized out after patch
>      add	sp,sp,t0 # prologue
>      ...
>      li	t0,12288 # epilogue
>      addi	t0,t0,2000 # optimized out after patch
>      add	sp,sp,t0
>      ...
>      addi	sp,sp,32
>      tail	__riscv_restore_4
> 
> after patch:
>    bar:
>      call	t0,__riscv_save_4
>      addi	sp,sp,-2032
>      ...
>      li	t0,-12288
>      add	sp,sp,t0 # prologue
>      ...
>      li	t0,12288 # epilogue
>      add	sp,sp,t0
>      ...
>      addi	sp,sp,2032
>      tail	__riscv_restore_4
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv.cc (riscv_expand_prologue): consider save-restore in stack allocation.
>          (riscv_expand_epilogue): consider save-restore in stack deallocation.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/stack_save_restore.c: New test.
I made a couple of whitespace fixes and pushed this to the trunk after 
running it through a cross testing cycle.

Thanks!

jeff
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index f0bbcd6d6be..a50f2303032 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -5010,12 +5010,12 @@  void
 riscv_expand_prologue (void)
 {
   struct riscv_frame_info *frame = &cfun->machine->frame;
-  poly_int64 size = frame->total_size;
+  poly_int64 remaining_size = frame->total_size;
   unsigned mask = frame->mask;
   rtx insn;
 
   if (flag_stack_usage_info)
-    current_function_static_stack_size = constant_lower_bound (size);
+    current_function_static_stack_size = constant_lower_bound (remaining_size);
 
   if (cfun->machine->naked_p)
     return;
@@ -5026,7 +5026,7 @@  riscv_expand_prologue (void)
       rtx dwarf = NULL_RTX;
       dwarf = riscv_adjust_libcall_cfi_prologue ();
 
-      size -= frame->save_libcall_adjustment;
+      remaining_size -= frame->save_libcall_adjustment;
       insn = emit_insn (riscv_gen_gpr_save_insn (frame));
       frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
 
@@ -5037,16 +5037,14 @@  riscv_expand_prologue (void)
   /* Save the registers.  */
   if ((frame->mask | frame->fmask) != 0)
     {
-      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
-      if (size.is_constant ())
-	step1 = MIN (size.to_constant(), step1);
+      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, remaining_size);
 
       insn = gen_add3_insn (stack_pointer_rtx,
 			    stack_pointer_rtx,
 			    GEN_INT (-step1));
       RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
-      size -= step1;
-      riscv_for_each_saved_reg (size, riscv_save_reg, false, false);
+      remaining_size -= step1;
+      riscv_for_each_saved_reg (remaining_size, riscv_save_reg, false, false);
     }
 
   frame->mask = mask; /* Undo the above fib.  */
@@ -5055,29 +5053,29 @@  riscv_expand_prologue (void)
   if (frame_pointer_needed)
     {
       insn = gen_add3_insn (hard_frame_pointer_rtx, stack_pointer_rtx,
-			    GEN_INT ((frame->hard_frame_pointer_offset - size).to_constant ()));
+			    GEN_INT ((frame->hard_frame_pointer_offset - remaining_size).to_constant ()));
       RTX_FRAME_RELATED_P (emit_insn (insn)) = 1;
 
       riscv_emit_stack_tie ();
     }
 
   /* Allocate the rest of the frame.  */
-  if (known_gt (size, 0))
+  if (known_gt (remaining_size, 0))
     {
       /* Two step adjustment:
 	 1.scalable frame. 2.constant frame.  */
       poly_int64 scalable_frame (0, 0);
-      if (!size.is_constant ())
+      if (!remaining_size.is_constant ())
 	{
 	  /* First for scalable frame.  */
-	  poly_int64 scalable_frame = size;
-	  scalable_frame.coeffs[0] = size.coeffs[1];
+	  poly_int64 scalable_frame = remaining_size;
+	  scalable_frame.coeffs[0] = remaining_size.coeffs[1];
 	  riscv_v_adjust_scalable_frame (stack_pointer_rtx, scalable_frame, false);
-	  size -= scalable_frame;
+	  remaining_size -= scalable_frame;
 	}
 
       /* Second step for constant frame.  */
-      HOST_WIDE_INT constant_frame = size.to_constant ();
+      HOST_WIDE_INT constant_frame = remaining_size.to_constant ();
       if (constant_frame == 0)
 	return;
 
@@ -5142,6 +5140,8 @@  riscv_expand_epilogue (int style)
   HOST_WIDE_INT step2 = 0;
   bool use_restore_libcall = ((style == NORMAL_RETURN)
 			      && riscv_use_save_libcall (frame));
+  unsigned libcall_size = use_restore_libcall ?
+                            frame->save_libcall_adjustment : 0;
   rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
   rtx insn;
 
@@ -5212,13 +5212,18 @@  riscv_expand_epilogue (int style)
       REG_NOTES (insn) = dwarf;
     }
 
+  if (use_restore_libcall)
+    frame->mask = 0; /* Temporarily fib for GPRs.  */
+
   /* If we need to restore registers, deallocate as much stack as
      possible in the second step without going out of range.  */
   if ((frame->mask | frame->fmask) != 0)
-    {
-      step2 = riscv_first_stack_step (frame, frame->total_size);
-      step1 -= step2;
-    }
+    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
+
+  if (use_restore_libcall)
+    frame->mask = mask; /* Undo the above fib.  */
+
+  step1 -= step2 + libcall_size;
 
   /* Set TARGET to BASE + STEP1.  */
   if (known_gt (step1, 0))
@@ -5272,15 +5277,12 @@  riscv_expand_epilogue (int style)
     frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
 
   /* Restore the registers.  */
-  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
+  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
+                            riscv_restore_reg,
 			    true, style == EXCEPTION_RETURN);
 
   if (use_restore_libcall)
-    {
       frame->mask = mask; /* Undo the above fib.  */
-      gcc_assert (step2 >= frame->save_libcall_adjustment);
-      step2 -= frame->save_libcall_adjustment;
-    }
 
   if (need_barrier_p)
     riscv_emit_stack_tie ();
diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
new file mode 100644
index 00000000000..522e706cfbf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops -fno-lto" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+char my_getchar();
+float getf();
+
+/*
+**bar:
+**	call	t0,__riscv_save_4
+**	addi	sp,sp,-2032
+**	...
+**	li	t0,-12288
+**	add	sp,sp,t0
+**	...
+**	li	t0,12288
+**	add	sp,sp,t0
+**	...
+**	addi	sp,sp,2032
+**	tail	__riscv_restore_4
+*/
+int bar()
+{
+  float volatile farray[3568];
+
+  float sum = 0;
+  float f1 = getf();
+  float f2 = getf();
+  float f3 = getf();
+  float f4 = getf();
+
+  for (int i = 0; i < 3568; i++)
+  {
+    farray[i] = my_getchar() * 1.2;
+    sum += farray[i];
+  }
+
+  return sum + f1 + f2 + f3 + f4;
+}
+