@@ -88,6 +88,7 @@ along with GCC; see the file COPYING3. If not see
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "sched-int.h"
+#include "tree-vectorizer.h"
/* This file should be included last. */
#include "target-def.h"
@@ -4199,6 +4200,130 @@ s390_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Vector cost model for s390; created via TARGET_VECTORIZE_CREATE_COSTS.  */
+class s390_vector_costs : public vector_costs
+{
+  /* Elementwise load/store whose second decomposition step is free.  */
+  stmt_vec_info skipfinalpart = NULL;
+public:
+  s390_vector_costs (vec_info *, bool);
+  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info, slp_tree node,
+			      tree vectype, int misalign,
+			      vect_cost_model_location where) override;
+};
+
+/* Construct the s390 cost model; explicitly clear the elementwise
+   access marker so add_stmt_cost never reads an indeterminate value.  */
+s390_vector_costs::s390_vector_costs (vec_info *vinfo, bool costing_for_scalar)
+  : vector_costs (vinfo, costing_for_scalar), skipfinalpart (NULL) {}
+/* Implement vector_costs::add_stmt_cost, refining the generic s390 costs.  */
+unsigned int
+s390_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+				  stmt_vec_info stmt_info, slp_tree node,
+				  tree vectype, int misalign,
+				  vect_cost_model_location where)
+{
+  bool fp = false;
+  int costs = s390_builtin_vectorization_cost (kind, vectype, misalign);
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+  /* SLP-external operands must be moved into vector registers.  */
+  if ((kind == scalar_to_vec || kind == vec_construct)
+      && node
+      && SLP_TREE_DEF_TYPE (node) == vect_external_def)
+    {
+      unsigned int i;
+      tree op;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	{
+	  if (TREE_CODE (op) != SSA_NAME
+	      || TREE_VISITED (op))
+	    continue;
+	  TREE_VISITED (op) = 1;
+	  gimple *def = SSA_NAME_DEF_STMT (op);
+	  tree temp;
+	  if (is_gimple_assign (def)
+	      && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))
+	      && (temp = gimple_assign_rhs1 (def))
+	      && TREE_CODE (temp) == SSA_NAME
+	      && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)),
+					TREE_TYPE (temp)))
+	    def = SSA_NAME_DEF_STMT (temp);
+	  if (!gimple_assign_load_p (def))
+	    {
+	      /* For scalar_to_vec from a fp register, we might not
+		 cross the register files.  So keep the penalty small.
+		 ??? If we have to cross, we actually cross twice
+		 leading to a huge runtime penalty.  Should we reflect
+		 this here?  */
+	      if (kind == scalar_to_vec && fp)
+		costs += 2;
+	      else
+		costs += 3;
+	    }
+	}
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+    }
+  if (kind == scalar_stmt && stmt_info && is_gimple_assign (stmt_info->stmt))
+    {
+      const gassign *assign = dyn_cast<const gassign *> (stmt_info->stmt);
+      tree comptype = NULL_TREE;
+      if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR)
+	comptype = TREE_TYPE (gimple_assign_rhs1 (assign));
+      if (gimple_assign_rhs_code (assign) == BIT_FIELD_REF)
+	comptype = TREE_TYPE (TREE_OPERAND (gimple_assign_rhs1 (assign), 0));
+      if (comptype != NULL_TREE && VECTOR_TYPE_P (comptype))
+	{
+	  /* This will be a vlvg or vlgv that crosses the register files.  */
+	  costs += 3;
+	}
+    }
+  if (stmt_info
+      && (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type
+	  || STMT_VINFO_TYPE (stmt_info) == load_vec_info_type))
+    {
+      if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE)
+	{
+	  /* The vectorizer models an elementwise access as two steps:
+	     a store becomes vec_to_scalar followed by scalar_store,
+	     a load becomes scalar_load followed by vec_construct.
+	     s390 can store/load single lanes directly, so the second
+	     step of each pair is free.  Remember the stmt when
+	     costing its first step and charge nothing when its
+	     second step is costed.  */
+	  if (kind == vec_to_scalar || kind == scalar_load)
+	    skipfinalpart = stmt_info;
+	  if ((kind == scalar_store || kind == vec_construct)
+	      && skipfinalpart == stmt_info)
+	    return 0;
+	}
+      else if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_CONTIGUOUS_REVERSE)
+	{
+	  /* gimple models a reversed access as a vec_perm followed by
+	     the load/store.  s390 has reversed vector load/store
+	     instructions, so the permutation is free.  */
+	  if (kind == vec_perm)
+	    return 0;
+	}
+    }
+  costs *= count;
+  return record_stmt_cost (stmt_info, where, (unsigned int) costs);
+}
+
+/* Implement targetm.vectorize.create_costs.  Returns a heap-allocated
+   cost model; the vectorizer owns and deletes it.  */
+static vector_costs *
+s390_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
+{
+  return new s390_vector_costs (vinfo, costing_for_scalar);
+}
+
/* If OP is a SYMBOL_REF of a thread-local symbol, return its TLS mode,
otherwise return 0. */
@@ -18088,6 +18213,8 @@ s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
s390_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_CREATE_COSTS
+#define TARGET_VECTORIZE_CREATE_COSTS s390_vectorize_create_costs
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG s390_reorg
new file mode 100644
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -march=z13" } */
+
+#define N 32
+/* Contiguous accesses; the loop should vectorize (see dg-final).  */
+void contiguous
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[i] * m;
+}
+/* Stride-2 load — presumably load + permute; must still vectorize.  */
+void contiguous_permute__load
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[2 * i] * m;
+}
+/* Stride-2 store — permute on the store side; must still vectorize.  */
+void contiguous_permute__store
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[2 * i] = in[i] * m;
+}
+/* Runtime stride s — forces elementwise accesses; must still vectorize.  */
+void elementwise
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[s * i] * m;
+}
+/* Reverse iteration — exercises the contiguous-reverse path.  */
+void contiguous_reverse
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[i] * m;
+}
+
+#if 0
+/* This does not work currently.
+   => "not falling back to elementwise accesses" */
+void contiguous_permute__load_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[2 * i] * m;
+}
+#endif
+/* Reverse iteration combined with a stride-2 store.  */
+void contiguous_permute__store_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[2 * i] = in[i] * m;
+}
+/* Reverse iteration combined with a runtime stride.  */
+void elementwise__reversed
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[s * i] * m;
+}
+
+/* { dg-final { scan-tree-dump-not "couldn't vectorize loop" "vect" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z15" } */
+/* All seven functions must be SLP-vectorized; names hint at the expected s390 instruction.  */
+void vrep
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 42;
+  x[2] = 42;
+  x[3] = 42;
+}
+/* Replicated mask-like constant; name suggests vgbm.  */
+void vgbm
+(int *x)
+{
+  x[0] = 0xff00;
+  x[1] = 0xff00;
+  x[2] = 0xff00;
+  x[3] = 0xff00;
+}
+/* Replicated contiguous-bit constant; name suggests vgm.  */
+void vgm
+(int *x)
+{
+  x[0] = 0x7e;
+  x[1] = 0x7e;
+  x[2] = 0x7e;
+  x[3] = 0x7e;
+}
+/* Mixed constants; name suggests a plain vector load (vl).  */
+void vl
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 0xff00;
+  x[2] = 0x7e;
+  x[3] = 0;
+}
+/* Straight vector copy; name suggests vl + vst.  */
+void vl_vst
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+/* Splat of i[0]; name suggests load-and-replicate (vlrepf).  */
+void vlrepf
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[0];
+  o[2] = i[0];
+  o[3] = i[0];
+}
+
+// int -> float conversion needs z15 (name suggests vcefb).
+void vcefb
+(float *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 7 "slp2" } } */
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z13" } */
+/* Operands arrive in GPRs; building vectors from them crosses register files, so SLP should be rejected as unprofitable (see dg-final).  */
+void elementwise
+(int *o, int i0, int i1, int i2, unsigned int i3)
+{
+  o[0] = i0;
+  o[1] = i1;
+  o[2] = i2;
+  o[3] = i3;
+}
+/* Replicating a single GPR value into all lanes.  */
+void elementreplicate
+(int *o, int i)
+{
+  o[0] = i;
+  o[1] = i;
+  o[2] = i;
+  o[3] = i;
+}
+/* The extra multiply still must not make vectorization profitable.  */
+void mult
+(int *o, int i0, int i1, int i2, int i3, int m)
+{
+  o[0] = i0 * m;
+  o[1] = i1 * m;
+  o[2] = i2 * m;
+  o[3] = i3 * m;
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 3 "slp2" } } */