[pushed] aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326]

Message ID mpto7ffw8yv.fsf@arm.com
State Committed
Commit e09007308c96a036a4a4e6fd4d6c09442b4c4420
Headers
Series [pushed] aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326] |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 warning Patch is already merged

Commit Message

Richard Sandiford Nov. 27, 2023, 2:45 p.m. UTC
  Many predicated SVE intrinsics provide three forms of predication:
zeroing, merging, and any/dont-care.  All three are equivalent when
the predicate is all-true, so this patch drops the zeroing and
merging in that case.

Tested on aarch64-linux-gnu & pushed.

Richard


gcc/
	PR target/106326
	* config/aarch64/aarch64-sve-builtins.h (is_ptrue): Declare.
	* config/aarch64/aarch64-sve-builtins.cc (is_ptrue): New function.
	(gimple_folder::redirect_pred_x): Likewise.
	(gimple_folder::fold): Use it.

gcc/testsuite/
	PR target/106326
	* gcc.target/aarch64/sve/acle/general/pr106326_1.c: New test.
---
 gcc/config/aarch64/aarch64-sve-builtins.cc    |  46 +++
 gcc/config/aarch64/aarch64-sve-builtins.h     |   3 +
 .../aarch64/sve/acle/general/pr106326_1.c     | 378 ++++++++++++++++++
 3 files changed, 427 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
  

Patch

diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index b61156302cf..ee81282a0be 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -2561,6 +2561,17 @@  vector_cst_all_same (tree v, unsigned int step)
   return true;
 }
 
+/* Return true if V is a constant predicate that acts as a ptrue when
+   predicating STEP-byte elements.  */
+bool
+is_ptrue (tree v, unsigned int step)
+{
+  return (TREE_CODE (v) == VECTOR_CST
+	  && TYPE_MODE (TREE_TYPE (v)) == VNx16BImode
+	  && integer_nonzerop (VECTOR_CST_ENCODED_ELT (v, 0))
+	  && vector_cst_all_same (v, step));
+}
+
 gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
 			      gimple_stmt_iterator *gsi_in, gcall *call_in)
   : function_call_info (gimple_location (call_in), instance, fndecl),
@@ -2635,6 +2646,37 @@  gimple_folder::redirect_call (const function_instance &instance)
   return call;
 }
 
+/* Redirect _z and _m calls to _x functions if the predicate is all-true.
+   This allows us to use unpredicated instructions, where available.  */
+gimple *
+gimple_folder::redirect_pred_x ()
+{
+  if (pred != PRED_z && pred != PRED_m)
+    return nullptr;
+
+  if (gimple_call_num_args (call) < 2)
+    return nullptr;
+
+  tree lhs_type = TREE_TYPE (TREE_TYPE (fndecl));
+  tree arg0_type = type_argument_type (TREE_TYPE (fndecl), 1);
+  tree arg1_type = type_argument_type (TREE_TYPE (fndecl), 2);
+  if (!VECTOR_TYPE_P (lhs_type)
+      || !VECTOR_TYPE_P (arg0_type)
+      || !VECTOR_TYPE_P (arg1_type))
+    return nullptr;
+
+  auto lhs_step = element_precision (lhs_type);
+  auto rhs_step = element_precision (arg1_type);
+  auto step = MAX (lhs_step, rhs_step);
+  if (!multiple_p (step, BITS_PER_UNIT)
+      || !is_ptrue (gimple_call_arg (call, 0), step / BITS_PER_UNIT))
+    return nullptr;
+
+  function_instance instance (*this);
+  instance.pred = PRED_x;
+  return redirect_call (instance);
+}
+
 /* Fold the call to constant VAL.  */
 gimple *
 gimple_folder::fold_to_cstu (poly_uint64 val)
@@ -2707,6 +2749,10 @@  gimple_folder::fold ()
   if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node)
     return NULL;
 
+  /* First try some simplifications that are common to many functions.  */
+  if (auto *call = redirect_pred_x ())
+    return call;
+
   return base->fold (*this);
 }
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index d646df1c026..b9148c51b28 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -500,6 +500,8 @@  public:
   tree load_store_cookie (tree);
 
   gimple *redirect_call (const function_instance &);
+  gimple *redirect_pred_x ();
+
   gimple *fold_to_cstu (poly_uint64);
   gimple *fold_to_pfalse ();
   gimple *fold_to_ptrue ();
@@ -673,6 +675,7 @@  extern tree acle_svpattern;
 extern tree acle_svprfop;
 
 bool vector_cst_all_same (tree, unsigned int);
+bool is_ptrue (tree, unsigned int);
 
 /* Return the ACLE type svbool_t.  */
 inline tree
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
new file mode 100644
index 00000000000..34604a8df6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
@@ -0,0 +1,378 @@ 
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** add1:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add1 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b8 (), x, y);
+}
+
+/*
+** add2:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add2 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b16 (), x, y);
+}
+
+/*
+** add3:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add3 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b32 (), x, y);
+}
+
+/*
+** add4:
+**	...
+**	movprfx	[^\n]+
+**	...
+**	ret
+*/
+svint32_t
+add4 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b64 (), x, y);
+}
+
+/*
+** add5:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add5 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b8 (), x, y);
+}
+
+/*
+** add6:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add6 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b16 (), x, y);
+}
+
+/*
+** add7:
+**	add	z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**	ret
+*/
+svint32_t
+add7 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** add8:
+**	ptrue	(p[0-7])\.d(?:, all)?
+**	add	z0\.s, \1/m, z0\.s, z1\.s
+**	ret
+*/
+svint32_t
+add8 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b64 (), x, y);
+}
+
+/*
+** add9:
+**	ptrue	(p[0-7])\.s(?:, all)?
+**	add	z0\.h, \1/m, z0\.h, z1\.h
+**	ret
+*/
+svint16_t
+add9 (svint16_t x, svint16_t y)
+{
+  return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** and1:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and1 (svint32_t x)
+{
+  return svand_z (svptrue_b8 (), x, 1);
+}
+
+/*
+** and2:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and2 (svint32_t x)
+{
+  return svand_z (svptrue_b16 (), x, 1);
+}
+
+/*
+** and3:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and3 (svint32_t x)
+{
+  return svand_z (svptrue_b32 (), x, 1);
+}
+
+/*
+** and4:
+**	(?!and	z0\.s, z0\.s, #).*
+**	ret
+*/
+svint32_t
+and4 (svint32_t x)
+{
+  return svand_z (svptrue_b64 (), x, 1);
+}
+
+/*
+** and5:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and5 (svint32_t x)
+{
+  return svand_m (svptrue_b8 (), x, 1);
+}
+
+/*
+** and6:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and6 (svint32_t x)
+{
+  return svand_m (svptrue_b16 (), x, 1);
+}
+
+/*
+** and7:
+**	and	z0\.s, z0\.s, #(?:0x)?1
+**	ret
+*/
+svint32_t
+and7 (svint32_t x)
+{
+  return svand_m (svptrue_b32 (), x, 1);
+}
+
+/*
+** and8:
+**	(?!and	z0\.s, z0\.s, #).*
+**	ret
+*/
+svint32_t
+and8 (svint32_t x)
+{
+  return svand_m (svptrue_b64 (), x, 1);
+}
+
+/*
+** and9:
+** (
+**	and	p0\.b, p0/z, p1\.b, p1\.b
+** |
+**	and	p0\.b, p1/z, p0\.b, p0\.b
+** )
+**	ret
+*/
+svbool_t
+and9 (svbool_t x, svbool_t y)
+{
+  return svand_z (svptrue_b8 (), x, y);
+}
+
+/*
+** not1:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	not	z0\.s, \1/m, z1\.s
+**	ret
+*/
+svint32_t
+not1 (svint32_t x, svint32_t y)
+{
+  return svnot_m (x, svptrue_b8 (), y);
+}
+
+/*
+** cvt1:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvtzs	z0\.s, \1/m, z0\.h
+**	ret
+*/
+svint32_t
+cvt1 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt2:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvtzs	z0\.s, \1/m, z0\.h
+**	ret
+*/
+svint32_t
+cvt2 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt3:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvtzs	z0\.s, \1/m, z0\.h
+**	ret
+*/
+svint32_t
+cvt3 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt4:
+**	...
+**	movprfx	[^\n]+
+**	...
+**	ret
+*/
+svint32_t
+cvt4 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt5:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvt	z0\.h, \1/m, z0\.s
+**	ret
+*/
+svfloat16_t
+cvt5 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt6:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvt	z0\.h, \1/m, z0\.s
+**	ret
+*/
+svfloat16_t
+cvt6 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt7:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	fcvt	z0\.h, \1/m, z0\.s
+**	ret
+*/
+svfloat16_t
+cvt7 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt8:
+**	...
+**	movprfx	[^\n]+
+**	...
+**	ret
+*/
+svfloat16_t
+cvt8 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt9:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	scvtf	z0\.h, \1/m, z0\.h
+**	ret
+*/
+svfloat16_t
+cvt9 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt10:
+**	ptrue	(p[0-7])\.b(?:, all)?
+**	scvtf	z0\.h, \1/m, z0\.h
+**	ret
+*/
+svfloat16_t
+cvt10 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt11:
+**	...
+**	movprfx	[^\n]+
+**	...
+**	ret
+*/
+svfloat16_t
+cvt11 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt12:
+**	...
+**	movprfx	[^\n]+
+**	...
+**	ret
+*/
+svfloat16_t
+cvt12 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+#ifdef __cplusplus
+}
+#endif