[v3,4/5] xtensa: Add setmemsi insn pattern

Message ID 87b80e93-0031-d847-9120-ceccd79c1a37@yahoo.co.jp
State New
Headers
Series None |

Commit Message

Takayuki 'January June' Suwa May 23, 2022, 3:52 p.m. UTC
  This patch introduces setmemsi insn pattern of two kinds, unrolled loop and
small loop, for fixed small length and constant initialization value.

gcc/ChangeLog:

	* gcc/config/xtensa/xtensa-protos.h
	(xtensa_expand_block_set_unrolled_loop,
	xtensa_expand_block_set_small_loop): New prototypes.
	* gcc/config/xtensa/xtensa.cc (xtensa_sizeof_MOVI,
	xtensa_expand_block_set_unrolled_loop,
	xtensa_expand_block_set_small_loop): New functions.
	* gcc/config/xtensa/xtensa.md (setmemsi): New expansion pattern.
	* gcc/config/xtensa/xtensa.opt (mlongcalls): Add target mask.
---
  gcc/config/xtensa/xtensa-protos.h |   2 +
  gcc/config/xtensa/xtensa.cc       | 211 ++++++++++++++++++++++++++++++
  gcc/config/xtensa/xtensa.md       |  16 +++
  gcc/config/xtensa/xtensa.opt      |   2 +-
  4 files changed, 230 insertions(+), 1 deletion(-)
  

Comments

Max Filippov May 26, 2022, 4:57 p.m. UTC | #1
On Mon, May 23, 2022 at 8:52 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> This patch introduces setmemsi insn pattern of two kinds, unrolled loop and
> small loop, for fixed small length and constant initialization value.
>
> gcc/ChangeLog:
>
>         * gcc/config/xtensa/xtensa-protos.h
>         (xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): New prototypes.
>         * gcc/config/xtensa/xtensa.cc (xtensa_sizeof_MOVI,
>         xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): New functions.
>         * gcc/config/xtensa/xtensa.md (setmemsi): New expansion pattern.
>         * gcc/config/xtensa/xtensa.opt (mlongcalls): Add target mask.
> ---
>   gcc/config/xtensa/xtensa-protos.h |   2 +
>   gcc/config/xtensa/xtensa.cc       | 211 ++++++++++++++++++++++++++++++
>   gcc/config/xtensa/xtensa.md       |  16 +++
>   gcc/config/xtensa/xtensa.opt      |   2 +-
>   4 files changed, 230 insertions(+), 1 deletion(-)

With this patch applied for the following test program

void f(char *p);

void g(void)
{
       char c[72] = {0};
       f(c);
}

the following code is generated with -O2:

       .text
       .literal_position
       .literal .LC0, f@PLT
       .align  4
       .global g
       .type   g, @function
g:
       entry   sp, 112
       movi.n  a10, 0
       s32i.n  a10, sp, 0
       addi.n  a9, sp, 4
       movi.n  a8, 0x11
       loop    a8, .L2_LEND
.L2:
       s32i.n  a10, a9, 0
       addi.n  a9, a9, 4
       .L2_LEND:
       l32r    a8, .LC0
       mov.n   a10, sp
       callx8  a8
       retw.n

The part

       s32i.n  a10, sp, 0
       addi.n  a9, sp, 4
       movi.n  a8, 0x11

looks redundant and could be just

mov a9, sp
movi a8, 0x12

is that something that can be addressed in this patch?
  
Takayuki 'January June' Suwa May 27, 2022, 3 a.m. UTC | #2
On 2022/05/27 1:57, Max Filippov wrote:
> is that something that can be addressed in this patch?

seems hard to resolve, because the RTL-generation pass passes only 68 
bytes in that case:

> void f(char *p);
> 
> void g(void)
> {
>        char c[72] = {0};
>        f(c);
> }

without this patch, we would get:

g:
	entry	sp, 112
	movi.n	a8, 0
	movi.n	a12, 0x44	; 68, not 72
	mov.n	a11, a8
	addi.n	a10, sp, 4	; skipped first 4 bytes
	s32i.n	a8, sp, 0	; cleared without using memset()
	call8	memset
	mov.n	a10, sp
	call8	f
	retw.n

perhaps it can be solved by using a peephole2 pattern... (depends on
whether peephole2 can capture code_label)

this behavior does not occur in configuration without zero-overhead 
loop, eg. in xtensa-lx106 (ESP8266 SoC):

g:
	addi	sp, sp, -96
	movi.n	a3, 0
	s32i	a0, sp, 92
	s32i.n	a3, sp, 0
	addi.n	a2, sp, 4
	addi	a4, sp, 72
.L2:
	s32i.n	a3, a2, 0
	addi.n	a2, a2, 4
	bne	a2, a4, .L2
	mov.n	a2, sp
	call0	f
	l32i	a0, sp, 92
	addi	sp, sp, 96
	ret.n

in x86_64-linux:

g:
.LFB0:
	.cfi_startproc
	subq	$88, %rsp
	.cfi_def_cfa_offset 96
	pxor	%xmm0, %xmm0
	movq	%rsp, %rdi
	movaps	%xmm0, (%rsp)
	movaps	%xmm0, 16(%rsp)
	movaps	%xmm0, 32(%rsp)
	movaps	%xmm0, 48(%rsp)
	movq	$0, 64(%rsp)
	call	f@PLT
	addq	$88, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:

or, dword-aligned element:

void f(int *p);
void g(void)
{
         int c[18] = { 0 };
         f(c);
}
  
Max Filippov May 27, 2022, 4:57 a.m. UTC | #3
On Thu, May 26, 2022 at 8:00 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> On 2022/05/27 1:57, Max Filippov wrote:
> > is that something that can be addressed in this patch?
>
> seems hard to resolve, because the RTL-generation pass passes only 68
> bytes in that case:
...
> this behavior does not occur in configuration without zero-overhead
> loop, eg. in xtensa-lx106 (ESP8266 SoC):

Ok, I'll commit it as is then.
  
Max Filippov May 27, 2022, 4:58 a.m. UTC | #4
On Mon, May 23, 2022 at 8:52 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> This patch introduces setmemsi insn pattern of two kinds, unrolled loop and
> small loop, for fixed small length and constant initialization value.
>
> gcc/ChangeLog:
>
>         * gcc/config/xtensa/xtensa-protos.h
>         (xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): New prototypes.
>         * gcc/config/xtensa/xtensa.cc (xtensa_sizeof_MOVI,
>         xtensa_expand_block_set_unrolled_loop,
>         xtensa_expand_block_set_small_loop): New functions.
>         * gcc/config/xtensa/xtensa.md (setmemsi): New expansion pattern.
>         * gcc/config/xtensa/xtensa.opt (mlongcalls): Add target mask.
> ---
>   gcc/config/xtensa/xtensa-protos.h |   2 +
>   gcc/config/xtensa/xtensa.cc       | 211 ++++++++++++++++++++++++++++++
>   gcc/config/xtensa/xtensa.md       |  16 +++
>   gcc/config/xtensa/xtensa.opt      |   2 +-
>   4 files changed, 230 insertions(+), 1 deletion(-)

Regtested for target=xtensa-linux-uclibc, no new regressions.
Changelog has extra 'gcc/' in paths, so I've dropped this part.
Committed to master.
  
Takayuki 'January June' Suwa May 27, 2022, 9:23 a.m. UTC | #5
On 2022/05/27 12:00, Takayuki 'January June' Suwa via Gcc-patches wrote:
> On 2022/05/27 1:57, Max Filippov wrote:
>> is that something that can be addressed in this patch?
> 
> seems hard to resolve, because the RTL-generation pass passes only 68 
> bytes in that case:

the culprit is here, but I don't know whether it is a known regression or not.

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 7197996cec7..be100dd9946 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -6043,13 +6043,19 @@ store_expr (tree exp, rtx target, int call_param_p,
        if (!can_store_by_pieces (str_copy_len, string_cst_read_str,
  				(void *) str, MEM_ALIGN (target), false))
  	goto normal_expr;
-
-      dest_mem = store_by_pieces (target, str_copy_len, 
string_cst_read_str,
-				  (void *) str, MEM_ALIGN (target), false,
-				  RETURN_END);
-      clear_storage (adjust_address_1 (dest_mem, BLKmode, 0, 1, 1, 0,
-				       exp_len - str_copy_len),
-		     GEN_INT (exp_len - str_copy_len), BLOCK_OP_NORMAL);
+      if (TREE_STRING_LENGTH (str) == 1 && *TREE_STRING_POINTER (str) == 0)
+	clear_storage (adjust_address_1 (target, BLKmode, 0, 1, 1, 0,
+					 exp_len),
+		       GEN_INT (exp_len), BLOCK_OP_NORMAL);
+      else
+	{
+	  dest_mem = store_by_pieces (target, str_copy_len, string_cst_read_str,
+				      (void *) str, MEM_ALIGN (target), false,
+				      RETURN_END);
+	  clear_storage (adjust_address_1 (dest_mem, BLKmode, 0, 1, 1, 0,
+					   exp_len - str_copy_len),
+			 GEN_INT (exp_len - str_copy_len), BLOCK_OP_NORMAL);
+	}
        return NULL_RTX;
      }
    else
  

Patch

diff --git a/gcc/config/xtensa/xtensa-protos.h 
b/gcc/config/xtensa/xtensa-protos.h
index 4bc42da2320..30e4b54394a 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -41,6 +41,8 @@  extern void xtensa_expand_conditional_branch (rtx *, 
machine_mode);
  extern int xtensa_expand_conditional_move (rtx *, int);
  extern int xtensa_expand_scc (rtx *, machine_mode);
  extern int xtensa_expand_block_move (rtx *);
+extern int xtensa_expand_block_set_unrolled_loop (rtx *);
+extern int xtensa_expand_block_set_small_loop (rtx *);
  extern void xtensa_split_operand_pair (rtx *, machine_mode);
  extern int xtensa_emit_move_sequence (rtx *, machine_mode);
  extern rtx xtensa_copy_incoming_a7 (rtx);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index d2aabf38339..c7b54babc37 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1373,6 +1373,217 @@  xtensa_expand_block_move (rtx *operands)
  }


+/* Try to expand a block set operation to a sequence of RTL move
+   instructions.  If not optimizing, or if the block size is not a
+   constant, or if the block is too large, or if the value to
+   initialize the block with is not a constant, the expansion
+   fails and GCC falls back to calling memset().
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[2] is the initialization value
+   operands[3] is the alignment */
+
+static int
+xtensa_sizeof_MOVI (HOST_WIDE_INT imm)
+{
+  return (TARGET_DENSITY && IN_RANGE (imm, -32, 95)) ? 2 : 3;
+}
+
+int
+xtensa_expand_block_set_unrolled_loop (rtx *operands)
+{
+  rtx dst_mem = operands[0];
+  HOST_WIDE_INT bytes, value, align;
+  int expand_len, funccall_len;
+  rtx x, reg;
+  int offset;
+
+  if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2]))
+    return 0;
+
+  bytes = INTVAL (operands[1]);
+  if (bytes <= 0)
+    return 0;
+  value = (int8_t)INTVAL (operands[2]);
+  align = INTVAL (operands[3]);
+  if (align > MOVE_MAX)
+    align = MOVE_MAX;
+
+  /* Insn expansion: holding the init value.
+     Either MOV(.N) or L32R w/litpool.  */
+  if (align == 1)
+    expand_len = xtensa_sizeof_MOVI (value);
+  else if (value == 0 || value == -1)
+    expand_len = TARGET_DENSITY ? 2 : 3;
+  else
+    expand_len = 3 + 4;
+  /* Insn expansion: a series of aligned memory stores.
+     Consist of S8I, S16I or S32I(.N).  */
+  expand_len += (bytes / align) * (TARGET_DENSITY
+				   && align == 4 ? 2 : 3);
+  /* Insn expansion: the remainder, sub-aligned memory stores.
+     A combination of S8I and S16I as needed.  */
+  expand_len += ((bytes % align + 1) / 2) * 3;
+
+  /* Function call: preparing two arguments.  */
+  funccall_len = xtensa_sizeof_MOVI (value);
+  funccall_len += xtensa_sizeof_MOVI (bytes);
+  /* Function call: calling memset().  */
+  funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3;
+
+  /* Apply expansion bonus (2x) if optimizing for speed.  */
+  if (optimize > 1 && !optimize_size)
+    funccall_len *= 2;
+
+  /* Decide whether to expand or not, based on the sum of the length
+     of instructions.  */
+  if (expand_len > funccall_len)
+    return 0;
+
+  x = XEXP (dst_mem, 0);
+  if (!REG_P (x))
+    dst_mem = replace_equiv_address (dst_mem, force_reg (Pmode, x));
+  switch (align)
+    {
+    case 1:
+      break;
+    case 2:
+      value = (int16_t)((uint8_t)value * 0x0101U);
+      break;
+    case 4:
+      value = (int32_t)((uint8_t)value * 0x01010101U);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  reg = force_reg (SImode, GEN_INT (value));
+
+  offset = 0;
+  do
+    {
+      int unit_size = MIN (bytes, align);
+      machine_mode unit_mode = (unit_size >= 4 ? SImode :
+			       (unit_size >= 2 ? HImode :
+						 QImode));
+      unit_size = GET_MODE_SIZE (unit_mode);
+
+      emit_move_insn (adjust_address (dst_mem, unit_mode, offset),
+		      unit_mode == SImode ? reg
+		      : convert_to_mode (unit_mode, reg, true));
+
+      offset += unit_size;
+      bytes -= unit_size;
+    }
+  while (bytes > 0);
+
+  return 1;
+}
+
+int
+xtensa_expand_block_set_small_loop (rtx *operands)
+{
+  HOST_WIDE_INT bytes, value, align;
+  int expand_len, funccall_len;
+  rtx x, dst, end, reg;
+  machine_mode unit_mode;
+  rtx_code_label *label;
+
+  if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2]))
+    return 0;
+
+  bytes = INTVAL (operands[1]);
+  if (bytes <= 0)
+    return 0;
+  value = (int8_t)INTVAL (operands[2]);
+  align = INTVAL (operands[3]);
+  if (align > MOVE_MAX)
+    align = MOVE_MAX;
+
+  /* Totally-aligned block only.  */
+  if (bytes % align != 0)
+    return 0;
+
+  /* If 4-byte aligned, small loop substitution is almost optimal, thus
+     limited to only offset to the end address for ADDI/ADDMI 
instruction.  */
+  if (align == 4
+      && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
+    return 0;
+
+  /* If no 4-byte aligned, loop count should be treated as the 
constraint.  */
+  if (align != 4
+      && bytes / align > ((optimize > 1 && !optimize_size) ? 8 : 15))
+    return 0;
+
+  /* Insn expansion: holding the init value.
+     Either MOV(.N) or L32R w/litpool.  */
+  if (align == 1)
+    expand_len = xtensa_sizeof_MOVI (value);
+  else if (value == 0 || value == -1)
+    expand_len = TARGET_DENSITY ? 2 : 3;
+  else
+    expand_len = 3 + 4;
+  /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
+  expand_len += bytes > 127 ? 3
+			    : (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
+
+  /* Insn expansion: the loop body and branch instruction.
+     For store, one of S8I, S16I or S32I(.N).
+     For advance, ADDI(.N).
+     For branch, BNE.  */
+  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+		+ (TARGET_DENSITY ? 2 : 3) + 3;
+
+  /* Function call: preparing two arguments.  */
+  funccall_len = xtensa_sizeof_MOVI (value);
+  funccall_len += xtensa_sizeof_MOVI (bytes);
+  /* Function call: calling memset().  */
+  funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3;
+
+  /* Apply expansion bonus (2x) if optimizing for speed.  */
+  if (optimize > 1 && !optimize_size)
+    funccall_len *= 2;
+
+  /* Decide whether to expand or not, based on the sum of the length
+     of instructions.  */
+  if (expand_len > funccall_len)
+    return 0;
+
+  x = XEXP (operands[0], 0);
+  if (!REG_P (x))
+    x = XEXP (replace_equiv_address (operands[0], force_reg (Pmode, 
x)), 0);
+  dst = gen_reg_rtx (SImode);
+  emit_move_insn (dst, x);
+  end = gen_reg_rtx (SImode);
+  emit_insn (gen_addsi3 (end, dst, operands[1] /* the length */));
+  switch (align)
+    {
+    case 1:
+      unit_mode = QImode;
+      break;
+    case 2:
+      value = (int16_t)((uint8_t)value * 0x0101U);
+      unit_mode = HImode;
+      break;
+    case 4:
+      value = (int32_t)((uint8_t)value * 0x01010101U);
+      unit_mode = SImode;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  reg = force_reg (unit_mode, GEN_INT (value));
+
+  label = gen_label_rtx ();
+  emit_label (label);
+  emit_move_insn (gen_rtx_MEM (unit_mode, dst), reg);
+  emit_insn (gen_addsi3 (dst, dst, GEN_INT (align)));
+  emit_cmp_and_jump_insns (dst, end, NE, const0_rtx, SImode, true, label);
+
+  return 1;
+}
+
+
  void
  xtensa_expand_nonlocal_goto (rtx *operands)
  {
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 96e043b26b5..2d146b7995c 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1080,6 +1080,22 @@ 
    DONE;
  })

+;; Block sets
+
+(define_expand "setmemsi"
+  [(match_operand:BLK 0 "memory_operand")
+   (match_operand:SI 1 "")
+   (match_operand:SI 2 "")
+   (match_operand:SI 3 "const_int_operand")]
+  "!optimize_debug && optimize"
+{
+  if (xtensa_expand_block_set_unrolled_loop (operands))
+    DONE;
+  if (xtensa_expand_block_set_small_loop (operands))
+    DONE;
+  FAIL;
+})
+
  
  ;; Shift instructions.

diff --git a/gcc/config/xtensa/xtensa.opt b/gcc/config/xtensa/xtensa.opt
index c406297af0d..1fc68a3d994 100644
--- a/gcc/config/xtensa/xtensa.opt
+++ b/gcc/config/xtensa/xtensa.opt
@@ -27,7 +27,7 @@  Target Mask(FORCE_NO_PIC)
  Disable position-independent code (PIC) for use in OS kernel code.

  mlongcalls
-Target
+Target Mask(LONGCALLS)
  Use indirect CALLXn instructions for large programs.

  mtarget-align