tree-optimization/116352 - SLP scheduling and stmt order

Message ID 20241202130558.6D1ED385842C@sourceware.org
State Committed
Commit 5ab3f091b3eb42795340d3c9cea8aaec2060693c
Headers
Series tree-optimization/116352 - SLP scheduling and stmt order |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Patch failed to apply

Commit Message

Richard Biener Dec. 2, 2024, 1:04 p.m. UTC
  The PR uncovers unchecked constraints on the ability to code-generate
with SLP but also latent issues with regard to stmt order checking
since loop (early-break) and BB (for quite some time) vectorization
are no longer constraint to single-BBs.  In particular get_later_stmt
simply compares UIDs of stmts, but that's only reliable when they
are in the same BB.

For the PR in question the problematical case is demoting a SLP node
to external which fails to check we can actually code generate this
in the way we do (using get_later_stmt).  The following thus adds
checking that we demote to external only when all defs are from
the same BB.

We no longer vectorize gcc.dg/vect/bb-slp-49.c but the testcase was
for a wrong-code issue and the vectorization done is a no-op.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

	PR tree-optimization/116352
	PR tree-optimization/117876
	* tree-vect-slp.cc (vect_slp_can_convert_to_external): New.
	(vect_slp_convert_to_external): Call it.
	(vect_build_slp_tree_2): Likewise.

	* gcc.dg/vect/pr116352.c: New testcase.
	* gcc.dg/vect/bb-slp-49.c: Remove vectorization check.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-49.c |  3 +--
 gcc/testsuite/gcc.dg/vect/pr116352.c  | 34 +++++++++++++++++++++++++++
 gcc/tree-vect-slp.cc                  | 29 ++++++++++++++++++-----
 3 files changed, 58 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr116352.c
  

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c
index e7101fcff46..c0ad5d70a9a 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c
@@ -23,6 +23,5 @@  main ()
   return 0;
 }
 
-/* See that we vectorize an SLP instance.  */
+/* See that we try to vectorize an SLP instance.  */
 /* { dg-final { scan-tree-dump "Analyzing vectorizable constructor" "slp1" } } */
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr116352.c b/gcc/testsuite/gcc.dg/vect/pr116352.c
new file mode 100644
index 00000000000..3fe537c34ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr116352.c
@@ -0,0 +1,34 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+static void addPrior(float center_x, float center_y, float width, float height,
+                     bool normalized, float *dst)
+{
+  if (normalized)
+    {
+      dst[0] = (center_x - width * 0.5f);
+      dst[1] = (center_y - height * 0.5f);
+      dst[2] = (center_x + width * 0.5f);
+      dst[3] = (center_y + height * 0.5f);
+    }
+  else
+    {
+      dst[0] = center_x - width * 0.5f;
+      dst[1] = center_y - height * 0.5f;
+      dst[2] = center_x + width * 0.5f - 1.0f;
+      dst[3] = center_y + height * 0.5f - 1.0f;
+    }
+}
+void forward(float *outputPtr, int _offsetsXs, float *_offsetsX,
+	     float *_offsetsY, float _stepX, float _stepY,
+	     bool _bboxesNormalized, float _boxWidth, float _boxHeight)
+{
+  for (int j = 0; j < _offsetsXs; ++j)
+    {
+      float center_x = (_offsetsX[j]) * _stepX;
+      float center_y = (_offsetsY[j]) * _stepY;
+      addPrior(center_x, center_y, _boxWidth, _boxHeight, _bboxesNormalized,
+	       outputPtr);
+      outputPtr += 4;
+    }
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ec986cc3f68..1799d5a619b 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -67,6 +67,7 @@  static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
 					  slp_tree, stmt_vector_for_cost *);
 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
+static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
 
 static object_allocator<_slp_tree> *slp_tree_pool;
 static slp_tree slp_first_node;
@@ -2887,7 +2888,8 @@  fail:
 	  for (j = 0; j < group_size; ++j)
 	    if (!matches[j])
 	      break;
-	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
+	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
+	      && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location,
@@ -7764,6 +7766,24 @@  vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 			    node, node_instance, cost_vec);
 }
 
+/* Verify if we can externalize a set of internal defs.  */
+
+static bool
+vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
+{
+  basic_block bb = NULL;
+  for (stmt_vec_info stmt : stmts)
+    if (!stmt)
+      return false;
+    /* Constant generation uses get_later_stmt which can only handle
+       defs from the same BB.  */
+    else if (!bb)
+      bb = gimple_bb (stmt->stmt);
+    else if (gimple_bb (stmt->stmt) != bb)
+      return false;
+  return true;
+}
+
 /* Try to build NODE from scalars, returning true on success.
    NODE_INSTANCE is the SLP instance that contains NODE.  */
 
@@ -7779,13 +7799,10 @@  vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
       || !SLP_TREE_SCALAR_STMTS (node).exists ()
       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
       /* Force the mask use to be built from scalars instead.  */
-      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
+      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
+      || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
     return false;
 
-  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
-    if (!stmt_info)
-      return false;
-
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Building vector operands of %p from scalars instead\n",