[3/3,v2] xtensa: Optimize 'cstoresi4' insn pattern

Message ID b861d41b-b48e-6e3e-8e62-dd21d0362101@yahoo.co.jp
State New
Headers
Series None |

Commit Message

Takayuki 'January June' Suwa May 30, 2023, 9:51 a.m. UTC
  Resubmitting the correct version; the previous one was wrong because the fixes were merged in the wrong order.
---
This patch introduces more optimized implementations for the 6 cstoresi4
insn comparison methods (eq/ne/lt/le/gt/ge; note that the eq optimization
requires TARGET_NSA).

gcc/ChangeLog:

	* config/xtensa/xtensa.cc (xtensa_expand_scc):
	Add dedicated optimization code for cstoresi4 (eq/ne/gt/ge/lt/le).
	* config/xtensa/xtensa.md (xtensa_ge_zero):
	Rename from '*signed_ge_zero', because it had to be called from
	'xtensa_expand_scc()'.
---
 gcc/config/xtensa/xtensa.cc | 106 ++++++++++++++++++++++++++++++++----
 gcc/config/xtensa/xtensa.md |   2 +-
 2 files changed, 96 insertions(+), 12 deletions(-)
  

Comments

Max Filippov May 31, 2023, 4:37 a.m. UTC | #1
Hi Suwa-san,

On Tue, May 30, 2023 at 2:51 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> Resubmitting the correct one due to a mistake in merging order of fixes.
> ---
> This patch introduces more optimized implementations for the 6 cstoresi4
> insn comparison methods (eq/ne/lt/le/gt/ge, however, required TARGET_NSA
> for eq).
>
> gcc/ChangeLog:
>
>         * config/xtensa/xtensa.cc (xtensa_expand_scc):
>         Add dedicated optimization code for cstoresi4 (eq/ne/gt/ge/lt/le).
>         * config/xtensa/xtensa.md (xtensa_ge_zero):
>         Rename from '*signed_ge_zero', because it had to be called from
>         'xtensa_expand_scc()'.
> ---
>  gcc/config/xtensa/xtensa.cc | 106 ++++++++++++++++++++++++++++++++----
>  gcc/config/xtensa/xtensa.md |   2 +-
>  2 files changed, 96 insertions(+), 12 deletions(-)

This change introduces a bunch of testsuite failures:

+FAIL: gcc.c-torture/execute/20070623-1.c   -O0  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -O1  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -O2  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -O3 -g  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -Os  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.c-torture/execute/20070623-1.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -O0  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -O1  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -O2  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -O3 -g  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -Os  execution test
+FAIL: gcc.c-torture/execute/920612-1.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -O0  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -O1  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -O2  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -O3 -g  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -Os  execution test
+FAIL: gcc.c-torture/execute/int-compare.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -O0  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -O1  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -O2  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -O3 -g  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -Os  execution test
+FAIL: gcc.c-torture/execute/pr28651.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -O0  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -O1  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -O2  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -O3 -g  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -Os  execution test
+FAIL: gcc.c-torture/execute/pr55137.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.dg/pr61045.c execution test
+FAIL: gcc.dg/signbit-6.c execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-12.c   -O2  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-12.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-12.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-13.c   -O2  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-13.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-13.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-14.c   -O2  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-14.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-p-14.c   -O2  execution test
+FAIL: c-c++-common/torture/builtin-arith-overflow-p-14.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  execution test
+FAIL: gcc.dg/torture/pr49958.c   -O0  execution test
+FAIL: gcc.dg/torture/pr49958.c   -O1  execution test
+FAIL: gcc.dg/torture/pr49958.c   -O2  execution test
+FAIL: gcc.dg/torture/pr49958.c   -O3 -g  execution test
+FAIL: gcc.dg/torture/pr49958.c   -Os  execution test
+FAIL: gcc.dg/torture/pr49958.c   -O2 -flto -fno-use-linker-plugin
-flto-partition=none  execution test
+FAIL: gcc.dg/tree-ssa/pr68714.c (internal compiler error: in
decompose, at rtl.h:2297)
+FAIL: gcc.dg/tree-ssa/pr68714.c (test for excess errors)
+FAIL: gcc.dg/tree-ssa/pr81346-4.c execution test

> diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
> index 3b5d25b660a..64efd3d7287 100644
> --- a/gcc/config/xtensa/xtensa.cc
> +++ b/gcc/config/xtensa/xtensa.cc
> @@ -991,24 +991,108 @@ xtensa_expand_conditional_move (rtx *operands, int isflt)
>  int
>  xtensa_expand_scc (rtx operands[4], machine_mode cmp_mode)
>  {
> -  rtx dest = operands[0];
> -  rtx cmp;
> -  rtx one_tmp, zero_tmp;
> +  rtx dest = operands[0], op0 = operands[2], op1 = operands[3];
> +  enum rtx_code code = GET_CODE (operands[1]);
> +  rtx cmp, tmp0, tmp1;
>    rtx (*gen_fn) (rtx, rtx, rtx, rtx, rtx);
>
> -  if (!(cmp = gen_conditional_move (GET_CODE (operands[1]), cmp_mode,
> -                                   operands[2], operands[3])))
> -    return 0;
> +  /* Dedicated optimizations for cstoresi4.
> +     a. In a magnitude comparison operator, swapping both sides and
> +       inverting magnitude does not change the result,
> +       eg. '(x >= y) != (y <= x)' is a constant of zero
> +       (GE is changed to LE, not LT).
> +     b. Due to room for further optimization, we use subtraction rather
> +       than XOR (the default for RTL expansion of EQ/NE) as the binary
> +       operation which is zero if both sides are the same and non-zero
> +       otherwise.  */
> +  if (cmp_mode == SImode)
> +    switch (code)
> +      {
> +      /* EQ(op0, op1) := clz(op0 - op1) / 32 [requires TARGET_NSA] */
> +      case EQ:
> +       if (!TARGET_NSA)
> +         break;
> +       /* EQ to EQZ conversion by subtracting op1 from op0.  */
> +       emit_move_insn (dest,
> +                       expand_binop (SImode, sub_optab, op0, op1,
> +                                     0, 0, OPTAB_LIB_WIDEN));
> +       /* NSAU instruction will return 32 iff the source is zero,
> +          zero through 31 otherwise (See Xtensa ISA Reference Manual,
> +          p. 462)  */
> +       emit_insn (gen_clzsi2 (dest, dest));
> +       emit_insn (gen_lshrsi3 (dest, dest, GEN_INT (5)));
> +       return 1;
> +
> +      /* NE(op0, op1) := (op0 - op1) == 0 ? 0 : 1 */
> +      case NE:
> +       /* NE to NEZ conversion by subtracting op1 from op0.  */
> +       emit_move_insn (tmp0 = gen_reg_rtx (SImode),
> +                       expand_binop (SImode, sub_optab, op0, op1,
> +                                     0, 0, OPTAB_LIB_WIDEN));
> +       emit_move_insn (dest, const_true_rtx);
> +       emit_move_insn (dest,
> +                       gen_rtx_fmt_eee (IF_THEN_ELSE, SImode,
> +                                        gen_rtx_fmt_ee (EQ, VOIDmode,
> +                                                        tmp0, const0_rtx),
> +                                        tmp0, dest));
> +       return 1;
> +
> +      case LE:
> +       if (REG_P (op1))
> +         {
> +           /* LE to GE conversion by swapping both sides.  */
> +           tmp0 = op0, op0 = op1, op1 = tmp0;
> +           goto case_GE_reg;
> +         }
> +       /* LE to LT conversion by adding one to op1.  */
> +       op1 = GEN_INT (INTVAL (op1) + 1);
> +       /* fallthru */
> +
> +      /* LT(op0, op1) := (unsigned)(op0 - op1) >> 31 */

This doesn't work when the subtraction overflows, e.g. for op0 == INT_MIN,
op1 == INT_MAX: the difference wraps around to 1, so its sign bit no longer
reflects the ordering (as demonstrated by gcc.c-torture/execute/20070623-1.c).

Maybe the dedicated instructions salt / saltu could be used in that pattern?
They don't have their own XCHAL_* macros, but according to the ISA book
they were introduced in RG-2015.0, which I believe could be tested as follows:

#define TARGET_SALT (XTENSA_MARCH_EARLIEST >= 270000)
  

Patch

diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 3b5d25b660a..64efd3d7287 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -991,24 +991,108 @@  xtensa_expand_conditional_move (rtx *operands, int isflt)
 int
 xtensa_expand_scc (rtx operands[4], machine_mode cmp_mode)
 {
-  rtx dest = operands[0];
-  rtx cmp;
-  rtx one_tmp, zero_tmp;
+  rtx dest = operands[0], op0 = operands[2], op1 = operands[3];
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx cmp, tmp0, tmp1;
   rtx (*gen_fn) (rtx, rtx, rtx, rtx, rtx);
 
-  if (!(cmp = gen_conditional_move (GET_CODE (operands[1]), cmp_mode,
-				    operands[2], operands[3])))
-    return 0;
+  /* Dedicated optimizations for cstoresi4.
+     a. In a magnitude comparison operator, swapping both sides and
+	inverting magnitude does not change the result,
+	eg. '(x >= y) != (y <= x)' is a constant of zero
+	(GE is changed to LE, not LT).
+     b. Due to room for further optimization, we use subtraction rather
+	than XOR (the default for RTL expansion of EQ/NE) as the binary
+	operation which is zero if both sides are the same and non-zero
+	otherwise.  */
+  if (cmp_mode == SImode)
+    switch (code)
+      {
+      /* EQ(op0, op1) := clz(op0 - op1) / 32 [requires TARGET_NSA] */
+      case EQ:
+	if (!TARGET_NSA)
+	  break;
+	/* EQ to EQZ conversion by subtracting op1 from op0.  */
+	emit_move_insn (dest,
+			expand_binop (SImode, sub_optab, op0, op1,
+				      0, 0, OPTAB_LIB_WIDEN));
+	/* NSAU instruction will return 32 iff the source is zero,
+	   zero through 31 otherwise (See Xtensa ISA Reference Manual,
+	   p. 462)  */
+	emit_insn (gen_clzsi2 (dest, dest));
+	emit_insn (gen_lshrsi3 (dest, dest, GEN_INT (5)));
+	return 1;
+
+      /* NE(op0, op1) := (op0 - op1) == 0 ? 0 : 1 */
+      case NE:
+	/* NE to NEZ conversion by subtracting op1 from op0.  */
+	emit_move_insn (tmp0 = gen_reg_rtx (SImode),
+			expand_binop (SImode, sub_optab, op0, op1,
+				      0, 0, OPTAB_LIB_WIDEN));
+	emit_move_insn (dest, const_true_rtx);
+	emit_move_insn (dest,
+			gen_rtx_fmt_eee (IF_THEN_ELSE, SImode,
+					 gen_rtx_fmt_ee (EQ, VOIDmode,
+							 tmp0, const0_rtx),
+					 tmp0, dest));
+	return 1;
+
+      case LE:
+	if (REG_P (op1))
+	  {
+	    /* LE to GE conversion by swapping both sides.  */
+	    tmp0 = op0, op0 = op1, op1 = tmp0;
+	    goto case_GE_reg;
+	  }
+	/* LE to LT conversion by adding one to op1.  */
+	op1 = GEN_INT (INTVAL (op1) + 1);
+	/* fallthru */
+
+      /* LT(op0, op1) := (unsigned)(op0 - op1) >> 31 */
+      case LT:
+case_LT:
+	/* LT to LTZ conversion by subtracting op1 from op0.  */
+	emit_move_insn (dest,
+			expand_binop (SImode, sub_optab, op0, op1,
+				      0, 0, OPTAB_LIB_WIDEN));
+	emit_insn (gen_lshrsi3 (dest, dest, GEN_INT (31)));
+	return 1;
+
+      case GE:
+	if (REG_P (op1))
+	  {
+case_GE_reg:
+	    /* GE to GEZ conversion by subtracting op1 from op0.  */
+	    emit_move_insn (dest,
+			    expand_binop (SImode, sub_optab, op0, op1,
+					  0, 0, OPTAB_LIB_WIDEN));
+	    /* Emitting the dedicated insn pattern.  */
+	    emit_insn (gen_xtensa_ge_zero (dest, dest));
+	    return 1;
+	  }
+	/* GE to GT conversion by subtracting one from op1.  */
+	op1 = GEN_INT (INTVAL (op1) - 1);
+	/* fallthru */
 
-  one_tmp = gen_reg_rtx (SImode);
-  zero_tmp = gen_reg_rtx (SImode);
-  emit_insn (gen_movsi (one_tmp, const_true_rtx));
-  emit_insn (gen_movsi (zero_tmp, const0_rtx));
+      case GT:
+	/* GT to LT conversion by swapping both sides.  */
+	tmp0 = op0, op0 = op1, op1 = tmp0;
+	goto case_LT;
 
+      default:
+	break;
+      }
+
+  if (! (cmp = gen_conditional_move (code, cmp_mode, op0, op1)))
+    return 0;
+
+  tmp0 = force_reg (SImode, const0_rtx);
+  tmp1 = force_reg (SImode, const_true_rtx);
   gen_fn = (cmp_mode == SImode
 	    ? gen_movsicc_internal0
 	    : gen_movsicc_internal1);
-  emit_insn (gen_fn (dest, XEXP (cmp, 0), one_tmp, zero_tmp, cmp));
+  emit_insn (gen_fn (dest, XEXP (cmp, 0), tmp1, tmp0, cmp));
+
   return 1;
 }
 
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 6882baaedfd..ebc305bd387 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -3136,7 +3136,7 @@ 
 		      (const_int 5)
 		      (const_int 6)))])
 
-(define_insn_and_split "*signed_ge_zero"
+(define_insn_and_split "xtensa_ge_zero"
   [(set (match_operand:SI 0 "register_operand" "=a")
 	(ge:SI (match_operand:SI 1 "register_operand" "r")
 	       (const_int 0)))]