From 94d34da8de2fd479c81e8398544466e6ffe7fdfc Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Wed, 22 May 2024 17:08:32 +0800
Subject: [PATCH 5/5] vect: Add accumulating-result pattern for lane-reducing
operation
This patch adds a pattern that folds a summation into the last operand of a
lane-reducing operation when appropriate.  It supplements the operation-specific
patterns for dot-prod/sad/widen-sum:

  sum = lane-reducing-op(..., 0) + value;
    =>
  sum = lane-reducing-op(..., value);
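
For example (an illustrative C sketch modelled on the new testcase; the
function and array names are made up), a reduction loop such as:

  int
  f (int res, signed char *a, signed char *b,
     signed char *c, signed char *d)
  {
    for (int i = 0; i < 64; i++)
      res += a[i] * b[i] + c[i] * d[i];
    return res;
  }

contains two dot-product candidates.  Once one of them has been recognized as
a DOT_PROD_EXPR with a zero accumulator, the remaining plus can be folded into
its last operand instead of being kept as a separate vector addition.  The
transform is controlled by the new --param=vect-lane-reducing-accum-pattern
(0 disables it, 1 restricts it to reduction statements, and 2, the default,
allows it for all statements).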
2024-05-22  Feng Xue  <fxue@os.amperecomputing.com>

gcc/
	* tree-vect-patterns.cc (vect_recog_lane_reducing_accum_pattern): New
	pattern function.
	(vect_vect_recog_func_ptrs): Add the new pattern function.
	* params.opt (vect-lane-reducing-accum-pattern): New parameter.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-accum-pattern.c: New test.
---
gcc/params.opt | 4 +
.../gcc.dg/vect/vect-reduc-accum-pattern.c | 61 ++++++++++
gcc/tree-vect-patterns.cc | 106 ++++++++++++++++++
3 files changed, 171 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c
@@ -1198,6 +1198,10 @@ The maximum factor which the loop vectorizer applies to the cost of statements i
Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization
Enable loop vectorization of floating point inductions.
+-param=vect-lane-reducing-accum-pattern=
+Common Joined UInteger Var(param_vect_lane_reducing_accum_pattern) Init(2) IntegerRange(0, 2) Param Optimization
+Allow combining a plus statement into a lane-reducing operation.  If the value is 2, allow this for all statements; if 1, only for reduction statements; otherwise disable it.
+
-param=vrp-block-limit=
Common Joined UInteger Var(param_vrp_block_limit) Init(150000) Optimization Param
Maximum number of basic blocks before VRP switches to a fast model with less memory requirements.
new file mode 100644
@@ -0,0 +1,61 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#define FN(name, S1, S2) \
+S1 int __attribute__ ((noipa)) \
+name (S1 int res, \
+ S2 char *restrict a, \
+ S2 char *restrict b, \
+ S2 char *restrict c, \
+ S2 char *restrict d) \
+{ \
+ for (int i = 0; i < N; i++) \
+ res += a[i] * b[i]; \
+ \
+ asm volatile ("" ::: "memory"); \
+ for (int i = 0; i < N; ++i) \
+ res += (a[i] * b[i] + c[i] * d[i]) << 3; \
+ \
+ return res; \
+}
+
+FN(f1_vec, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((signed int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ signed char a[N], b[N];
+ signed char c[N], d[N];
+
+#pragma GCC novector
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE2 + i * 5;
+ b[i] = BASE2 + OFFSET + i * 4;
+ c[i] = BASE2 + i * 6;
+ d[i] = BASE2 + OFFSET + i * 5;
+ }
+
+ if (f1_vec (0x12345, a, b, c, d) != f1_novec (0x12345, a, b, c, d))
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vect_recog_lane_reducing_accum_pattern: detected" "vect" { target { vect_sdot_qi } } } } */
@@ -1490,6 +1490,111 @@ vect_recog_abd_pattern (vec_info *vinfo,
return vect_convert_output (vinfo, stmt_vinfo, out_type, stmt, vectype_out);
}
+/* Function vect_recog_lane_reducing_accum_pattern
+
+   Try to fold a summation into the last operand of a lane-reducing
+   operation.
+
+ sum = lane-reducing-op(..., 0) + value;
+
+   A lane-reducing operation has two aspects: the main primitive operation
+   and the appendant result accumulation.  Pattern matching for the primitive
+   aspect is handled by the specific patterns for dot-prod/sad/widen-sum
+   respectively.  This function takes care of the accumulation aspect.
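+
+   For instance, with a dot-product (an illustrative gimple sketch):
+
+     S1: tmp = DOT_PROD_EXPR <x, y, 0>;
+     S2: sum = tmp + value;
+       =>
+     S:  sum = DOT_PROD_EXPR <x, y, value>;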
+
+ Input:
+
+ * STMT_VINFO: The stmt from which the pattern search begins.
+
+ Output:
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern, that is:
+ sum = lane-reducing-op(..., value);
+*/
+
+static gimple *
+vect_recog_lane_reducing_accum_pattern (vec_info *vinfo,
+ stmt_vec_info stmt_vinfo,
+ tree *type_out)
+{
+ if (!(stmt_vinfo->reduc_pattern_status & rpatt_formed))
+ return NULL;
+
+ if (param_vect_lane_reducing_accum_pattern == 0)
+ return NULL;
+
+ if (param_vect_lane_reducing_accum_pattern == 1)
+ {
+      /* Only allow combining for a loop reduction statement.  */
+ if (STMT_VINFO_REDUC_IDX (stmt_vinfo) < 0)
+ return NULL;
+ }
+
+ gimple *last_stmt = stmt_vinfo->stmt;
+
+ if (!is_gimple_assign (last_stmt)
+ || gimple_assign_rhs_code (last_stmt) != PLUS_EXPR)
+ return NULL;
+
+ gimple *lane_reducing_stmt = NULL;
+ tree sum_oprnd = NULL_TREE;
+
+ for (unsigned i = 0; i < 2; i++)
+ {
+ tree oprnd = gimple_op (last_stmt, i + 1);
+ vect_unpromoted_value unprom;
+ bool single_use_p = true;
+
+ if (!vect_look_through_possible_promotion (vinfo, oprnd, &unprom,
+ &single_use_p)
+ || !single_use_p)
+ continue;
+
+ stmt_vec_info oprnd_vinfo = vect_get_internal_def (vinfo, unprom.op);
+
+ if (!oprnd_vinfo)
+ continue;
+
+ gimple *stmt = oprnd_vinfo->stmt;
+
+ if (lane_reducing_stmt_p (stmt)
+ && integer_zerop (gimple_op (stmt, gimple_num_ops (stmt) - 1)))
+ {
+ lane_reducing_stmt = stmt;
+ sum_oprnd = gimple_op (last_stmt, 2 - i);
+ break;
+ }
+ }
+
+ if (!lane_reducing_stmt)
+ return NULL;
+
+ tree type = TREE_TYPE (gimple_get_lhs (last_stmt));
+
+ *type_out = get_vectype_for_scalar_type (vinfo, type);
+ if (!*type_out)
+ return NULL;
+
+ vect_pattern_detected ("vect_recog_lane_reducing_accum_pattern", last_stmt);
+
+ tree var = vect_recog_temp_ssa_var (type, NULL);
+ enum tree_code code = gimple_assign_rhs_code (lane_reducing_stmt);
+ gimple *pattern_stmt;
+
+ if (code == WIDEN_SUM_EXPR)
+ pattern_stmt = gimple_build_assign (var, code,
+ gimple_op (lane_reducing_stmt, 1),
+ sum_oprnd);
+ else
+ pattern_stmt = gimple_build_assign (var, code,
+ gimple_op (lane_reducing_stmt, 1),
+ gimple_op (lane_reducing_stmt, 2),
+ sum_oprnd);
+ return pattern_stmt;
+}
+
/* Recognize an operation that performs ORIG_CODE on widened inputs,
so that it can be treated as though it had the form:
@@ -7084,6 +7189,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
{ vect_recog_dot_prod_pattern, "dot_prod" },
{ vect_recog_sad_pattern, "sad" },
{ vect_recog_widen_sum_pattern, "widen_sum" },
+ { vect_recog_lane_reducing_accum_pattern, "lane_reducing_accum" },
{ vect_recog_bitfield_ref_pattern, "bitfield_ref" },
{ vect_recog_bit_insert_pattern, "bit_insert" },
--
2.17.1