[3/3] xtensa: Rework 'setmemsi' insn pattern

Message ID 32bc6e6d-a273-2e8f-45e3-d3a3abc27ea7@yahoo.co.jp
State New
Headers
Series [1/3] xtensa: Addendum of the commit e33d2dcb463161a110ac345a451132ce8b2b23d9 |

Commit Message

Takayuki 'January June' Suwa May 25, 2023, 3:08 p.m. UTC
  In order to reject voodoo estimation logic with lots of magic numbers,
this patch revises the code to measure the costs of the three memset
methods based on the actual emission size of the insn sequence
corresponding to each method and choose the smallest one.

gcc/ChangeLog:

	* config/xtensa/xtensa-protos.h
	(xtensa_expand_block_set_unrolled_loop,
	xtensa_expand_block_set_small_loop): Remove.
	(xtensa_expand_block_set): New prototype.
	* config/xtensa/xtensa.cc
	(xtensa_expand_block_set_libcall): New subfunction.
	(xtensa_expand_block_set_unrolled_loop,
	xtensa_expand_block_set_small_loop): Rewrite as subfunctions.
	(xtensa_expand_block_set): New function that calls the above
	subfunctions.
	* config/xtensa/xtensa.md (memsetsi): Change to invoke only
	xtensa_expand_block_set().
---
 gcc/config/xtensa/xtensa-protos.h |   3 +-
 gcc/config/xtensa/xtensa.cc       | 319 ++++++++++++++++--------------
 gcc/config/xtensa/xtensa.md       |   4 +-
 3 files changed, 172 insertions(+), 154 deletions(-)
  

Comments

Max Filippov May 26, 2023, 10:21 a.m. UTC | #1
On Thu, May 25, 2023 at 8:13 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> In order to reject voodoo estimation logic with lots of magic numbers,
> this patch revises the code to measure the costs of the three memset
> methods based on the actual emission size of the insn sequence
> corresponding to each method and choose the smallest one.
>
> gcc/ChangeLog:
>
>         * config/xtensa/xtensa-protos.h
>         (xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): Remove.
>         (xtensa_expand_block_set): New prototype.
>         * config/xtensa/xtensa.cc
>         (xtensa_expand_block_set_libcall): New subfunction.
>         (xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): Rewrite as subfunctions.
>         (xtensa_expand_block_set): New function that calls the above
>         subfunctions.
>         * config/xtensa/xtensa.md (memsetsi): Change to invoke only
>         xtensa_expand_block_set().
> ---
>  gcc/config/xtensa/xtensa-protos.h |   3 +-
>  gcc/config/xtensa/xtensa.cc       | 319 ++++++++++++++++--------------
>  gcc/config/xtensa/xtensa.md       |   4 +-
>  3 files changed, 172 insertions(+), 154 deletions(-)

Regtested for target=xtensa-linux-uclibc, no new regressions.
Committed to master.

--
Thanks.
-- Max
  

Patch

diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h
index ec715b44e4d..b0b15a42799 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -42,8 +42,7 @@  extern void xtensa_expand_conditional_branch (rtx *, machine_mode);
 extern int xtensa_expand_conditional_move (rtx *, int);
 extern int xtensa_expand_scc (rtx *, machine_mode);
 extern int xtensa_expand_block_move (rtx *);
-extern int xtensa_expand_block_set_unrolled_loop (rtx *);
-extern int xtensa_expand_block_set_small_loop (rtx *);
+extern int xtensa_expand_block_set (rtx *);
 extern void xtensa_split_operand_pair (rtx *, machine_mode);
 extern int xtensa_constantsynth (rtx, HOST_WIDE_INT);
 extern int xtensa_emit_move_sequence (rtx *, machine_mode);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 46ab9f36b56..3b5d25b660a 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -57,6 +57,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "rtl-iter.h"
 #include "insn-attr.h"
 #include "tree-pass.h"
+#include "print-rtl.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -1530,77 +1531,61 @@  xtensa_expand_block_move (rtx *operands)
 }
 
 
-/* Try to expand a block set operation to a sequence of RTL move
-   instructions.  If not optimizing, or if the block size is not a
-   constant, or if the block is too large, or if the value to
-   initialize the block with is not a constant, the expansion
-   fails and GCC falls back to calling memset().
+/* Worker function for xtensa_expand_block_set().
 
-   operands[0] is the destination
-   operands[1] is the length
-   operands[2] is the initialization value
-   operands[3] is the alignment */
+   Expand into an insn sequence that calls the "memset" function.  */
 
-static int
-xtensa_sizeof_MOVI (HOST_WIDE_INT imm)
+static rtx_insn *
+xtensa_expand_block_set_libcall (rtx dst_mem,
+				 HOST_WIDE_INT value,
+				 HOST_WIDE_INT bytes)
 {
-  return (TARGET_DENSITY && IN_RANGE (imm, -32, 95)) ? 2 : 3;
+  rtx reg;
+  rtx_insn *seq;
+
+  start_sequence ();
+
+  reg = XEXP (dst_mem, 0);
+  if (! REG_P (reg))
+    reg = XEXP (replace_equiv_address (dst_mem,
+				       force_reg (Pmode, reg)), 0);
+  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "memset"),
+		     LCT_NORMAL, VOIDmode,
+		     reg, SImode,
+		     GEN_INT (value), SImode,
+		     GEN_INT (bytes), SImode);
+
+  seq = get_insns ();
+  end_sequence ();
+
+  return seq;
 }
 
-int
-xtensa_expand_block_set_unrolled_loop (rtx *operands)
+/* Worker function for xtensa_expand_block_set().
+
+   Expand into an insn sequence of one constant load followed by multiple
+   memory stores.  Returns NULL if the conditions for expansion are not
+   met.  */
+
+static rtx_insn *
+xtensa_expand_block_set_unrolled_loop (rtx dst_mem,
+				       HOST_WIDE_INT value,
+				       HOST_WIDE_INT bytes,
+				       HOST_WIDE_INT align)
 {
-  rtx dst_mem = operands[0];
-  HOST_WIDE_INT bytes, value, align;
-  int expand_len, funccall_len;
-  rtx x, reg;
+  rtx reg;
   int offset;
+  rtx_insn *seq;
 
-  if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2]))
-    return 0;
+  if (bytes > 64)
+    return NULL;
 
-  bytes = INTVAL (operands[1]);
-  if (bytes <= 0)
-    return 0;
-  value = (int8_t)INTVAL (operands[2]);
-  align = INTVAL (operands[3]);
-  if (align > MOVE_MAX)
-    align = MOVE_MAX;
-
-  /* Insn expansion: holding the init value.
-     Either MOV(.N) or L32R w/litpool.  */
-  if (align == 1)
-    expand_len = xtensa_sizeof_MOVI (value);
-  else if (value == 0 || value == -1)
-    expand_len = TARGET_DENSITY ? 2 : 3;
-  else
-    expand_len = 3 + 4;
-  /* Insn expansion: a series of aligned memory stores.
-     Consist of S8I, S16I or S32I(.N).  */
-  expand_len += (bytes / align) * (TARGET_DENSITY
-				   && align == 4 ? 2 : 3);
-  /* Insn expansion: the remainder, sub-aligned memory stores.
-     A combination of S8I and S16I as needed.  */
-  expand_len += ((bytes % align + 1) / 2) * 3;
-
-  /* Function call: preparing two arguments.  */
-  funccall_len = xtensa_sizeof_MOVI (value);
-  funccall_len += xtensa_sizeof_MOVI (bytes);
-  /* Function call: calling memset().  */
-  funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3;
-
-  /* Apply expansion bonus (2x) if optimizing for speed.  */
-  if (optimize > 1 && !optimize_size)
-    funccall_len *= 2;
-
-  /* Decide whether to expand or not, based on the sum of the length
-     of instructions.  */
-  if (expand_len > funccall_len)
-    return 0;
+  start_sequence ();
 
-  x = XEXP (dst_mem, 0);
-  if (!REG_P (x))
-    dst_mem = replace_equiv_address (dst_mem, force_reg (Pmode, x));
+  reg = XEXP (dst_mem, 0);
+  if (! REG_P (reg))
+    dst_mem = replace_equiv_address (dst_mem,
+				     force_reg (Pmode, reg));
   switch (align)
     {
     case 1:
@@ -1621,45 +1606,43 @@  xtensa_expand_block_set_unrolled_loop (rtx *operands)
     {
       int unit_size = MIN (bytes, align);
       machine_mode unit_mode = (unit_size >= 4 ? SImode :
-			       (unit_size >= 2 ? HImode :
-						 QImode));
-      unit_size = GET_MODE_SIZE (unit_mode);
+			       (unit_size >= 2 ? HImode : QImode));
 
+      unit_size = GET_MODE_SIZE (unit_mode);
       emit_move_insn (adjust_address (dst_mem, unit_mode, offset),
-		      unit_mode == SImode ? reg
-		      : convert_to_mode (unit_mode, reg, true));
-
+		      (unit_mode == SImode) ? reg
+		       : convert_to_mode (unit_mode, reg, true));
       offset += unit_size;
       bytes -= unit_size;
     }
   while (bytes > 0);
 
-  return 1;
+  seq = get_insns ();
+  end_sequence ();
+
+  return seq;
 }
 
-int
-xtensa_expand_block_set_small_loop (rtx *operands)
+/* Worker function for xtensa_expand_block_set().
+
+   Expand into an insn sequence of a small loop that fills the memory
+   range.  Returns NULL if the conditions for expansion are not met.  */
+
+static rtx_insn *
+xtensa_expand_block_set_small_loop (rtx dst_mem,
+				    HOST_WIDE_INT value,
+				    HOST_WIDE_INT bytes,
+				    HOST_WIDE_INT align)
 {
-  HOST_WIDE_INT bytes, value, align, count;
-  int expand_len, funccall_len;
-  rtx x, dst, end, reg;
+  HOST_WIDE_INT count;
+  rtx reg, dst, end;
   machine_mode unit_mode;
   rtx_code_label *label;
-
-  if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2]))
-    return 0;
-
-  bytes = INTVAL (operands[1]);
-  if (bytes <= 0)
-    return 0;
-  value = (int8_t)INTVAL (operands[2]);
-  align = INTVAL (operands[3]);
-  if (align > MOVE_MAX)
-    align = MOVE_MAX;
+  rtx_insn *seq;
 
   /* Totally-aligned block only.  */
   if (bytes % align != 0)
-    return 0;
+    return NULL;
   count = bytes / align;
 
   /* If the Loop Option (zero-overhead looping) is configured and active,
@@ -1671,77 +1654,28 @@  xtensa_expand_block_set_small_loop (rtx *operands)
 	 instruction.  */
       if (align == 4
 	  && ! (bytes <= 127 || xtensa_simm8x256 (bytes)))
-	return 0;
+	return NULL;
 
       /* If no 4-byte aligned, loop count should be treated as the
 	 constraint.  */
       if (align != 4
 	  && count > ((optimize > 1 && !optimize_size) ? 8 : 15))
-	return 0;
+	return NULL;
     }
 
-  /* Insn expansion: holding the init value.
-     Either MOV(.N) or L32R w/litpool.  */
-  if (align == 1)
-    expand_len = xtensa_sizeof_MOVI (value);
-  else if (value == 0 || value == -1)
-    expand_len = TARGET_DENSITY ? 2 : 3;
-  else
-    expand_len = 3 + 4;
-  if (TARGET_LOOPS && optimize) /* zero-overhead looping */
-    {
-      /* Insn translation: Either MOV(.N) or L32R w/litpool for the
-	 loop count.  */
-      expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count)
-					   : 3 + 4;
-      /* Insn translation: LOOP, the zero-overhead looping setup
-	 instruction.  */
-      expand_len += 3;
-      /* Insn expansion: the loop body instructions.
-	For store, one of S8I, S16I or S32I(.N).
-	For advance, ADDI(.N).  */
-      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
-		    + (TARGET_DENSITY ? 2 : 3);
-    }
-  else /* NO zero-overhead looping */
-    {
-      /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
-      expand_len += bytes > 127 ? 3
-				: (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
-      /* Insn expansion: the loop body and branch instruction.
-	For store, one of S8I, S16I or S32I(.N).
-	For advance, ADDI(.N).
-	For branch, BNE.  */
-      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
-		    + (TARGET_DENSITY ? 2 : 3) + 3;
-    }
-
-  /* Function call: preparing two arguments.  */
-  funccall_len = xtensa_sizeof_MOVI (value);
-  funccall_len += xtensa_sizeof_MOVI (bytes);
-  /* Function call: calling memset().  */
-  funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3;
-
-  /* Apply expansion bonus (2x) if optimizing for speed.  */
-  if (optimize > 1 && !optimize_size)
-    funccall_len *= 2;
-
-  /* Decide whether to expand or not, based on the sum of the length
-     of instructions.  */
-  if (expand_len > funccall_len)
-    return 0;
+  start_sequence ();
 
-  x = XEXP (operands[0], 0);
-  if (!REG_P (x))
-    x = XEXP (replace_equiv_address (operands[0], force_reg (Pmode, x)), 0);
-  dst = gen_reg_rtx (SImode);
-  emit_move_insn (dst, x);
-  end = gen_reg_rtx (SImode);
-  if (TARGET_LOOPS && optimize)
-    x = force_reg (SImode, operands[1] /* the length */);
+  reg = XEXP (dst_mem, 0);
+  if (REG_P (reg))
+    emit_move_insn (dst = gen_reg_rtx (SImode), reg);
   else
-    x = operands[1];
-  emit_insn (gen_addsi3 (end, dst, x));
+    dst = XEXP (replace_equiv_address (dst_mem,
+				       force_reg (Pmode, reg)), 0);
+  emit_insn (gen_addsi3 (end = gen_reg_rtx (SImode),
+			 dst,
+			 (TARGET_LOOPS && optimize)
+			  ? force_reg (SImode, GEN_INT (bytes))
+			  : GEN_INT (bytes)));
   switch (align)
     {
     case 1:
@@ -1760,12 +1694,99 @@  xtensa_expand_block_set_small_loop (rtx *operands)
     }
   reg = force_reg (unit_mode, GEN_INT (value));
 
-  label = gen_label_rtx ();
-  emit_label (label);
+  emit_label (label = gen_label_rtx ());
   emit_move_insn (gen_rtx_MEM (unit_mode, dst), reg);
   emit_insn (gen_addsi3 (dst, dst, GEN_INT (align)));
   emit_cmp_and_jump_insns (dst, end, NE, const0_rtx, SImode, true, label);
 
+  seq = get_insns ();
+  end_sequence ();
+
+  return seq;
+}
+
+
+/* Try to expand a block set operation to a sequence of RTL move
+   instructions.  If not optimizing, or if the block size is not a
+   constant, or if the block is too large, or if the value to
+   initialize the block with is not a constant, the expansion
+   fails and GCC falls back to calling memset().
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[2] is the initialization value
+   operands[3] is the alignment */
+
+int
+xtensa_expand_block_set (rtx *operands)
+{
+  rtx dst_mem = operands[0];
+  HOST_WIDE_INT bytes, value, align;
+  rtx_insn *seq[3];
+  int min_cost, min_index, i, n, cost;
+  rtx_insn *insn;
+
+  if (! CONST_INT_P (operands[1])
+      || ! CONST_INT_P (operands[2])
+      || (bytes = INTVAL (operands[1])) <= 0)
+    return 0;
+
+  value = (int8_t)INTVAL (operands[2]);
+  align = INTVAL (operands[3]);
+  if (align > MOVE_MAX)
+    align = MOVE_MAX;
+
+  /* Try to generate three insn sequences that are equivalent except
+     in method and size.  */
+  seq[0] = xtensa_expand_block_set_libcall (dst_mem, value, bytes);
+  seq[1] = xtensa_expand_block_set_unrolled_loop (dst_mem, value,
+						  bytes, align);
+  seq[2] = xtensa_expand_block_set_small_loop (dst_mem, value,
+					       bytes, align);
+
+  /* Find the sequence that has minimum size-basis insn costs.  */
+  if (dump_file)
+    fprintf (dump_file, "xtensa_expand_block_set:\n");
+  min_cost = INT_MAX, min_index = 0;
+  for (i = 0; i < 3; ++i)
+    if ((insn = seq[i]))
+      {
+	if (dump_file)
+	  fprintf (dump_file, " method %d...\n", i);
+
+	for (n = 0, cost = 0; insn; insn = NEXT_INSN (insn))
+	  {
+	    if (active_insn_p (insn))
+	      ++n, cost += xtensa_insn_cost (insn, false);
+	    if (dump_file)
+	      dump_insn_slim (dump_file, insn);
+	  }
+
+	/* Apply expansion bonus if -O2 or -O3 by discounting the costs of
+	   the methods other than libcall.  */
+	if (i > 0)
+	  {
+	    if (optimize == 2 && !optimize_size)
+	      cost = (cost + 1) / 2;
+	    else if (optimize >= 3)
+	      cost = (cost + 2) / 4;
+	  }
+
+	if (dump_file)
+	  fprintf (dump_file, "\t%d active insns, %d cost.\n", n, cost);
+
+	if (cost < min_cost)
+	  min_cost = cost, min_index = i;
+      }
+  if (dump_file)
+    fprintf (dump_file, " choose method %d.\n", min_index);
+
+  /* Fall back if libcall is minimum.  */
+  if (min_index == 0)
+    return 0;
+
+  emit_insn (seq[min_index]);
+
   return 1;
 }
 
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 113b313026e..57e50911f52 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1547,9 +1547,7 @@ 
    (match_operand:SI 3 "const_int_operand")]
   "!optimize_debug && optimize"
 {
-  if (xtensa_expand_block_set_unrolled_loop (operands))
-    DONE;
-  if (xtensa_expand_block_set_small_loop (operands))
+  if (xtensa_expand_block_set (operands))
     DONE;
   FAIL;
 })