Fall back to single-lane SLP before falling back to no SLP

Message ID 20240918112302.2D52013508@imap1.dmz-prg2.suse.org
State New
Headers
Series Fall back to single-lane SLP before falling back to no SLP |

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint warning Lint failed
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gc-lp64d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc-lp64d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-test success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed

Commit Message

Richard Biener Sept. 18, 2024, 11:22 a.m. UTC
  The following changes what happens when any of the discovered SLP
instances fails vectorization checking: instead of immediately
disabling SLP, we first fall back to emulating with SLP what non-SLP
vectorization would do, by forcing single-lane discovery for all
instances.

The patch does not remove the final fallback to disable SLP but it
reduces the fallout from failing vectorization when any non-SLP
stmt survives analysis.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

I'm watching CI results but do not really expect to see big improvements,
except when forcing SLP, as with the recently posted
"Testing: fail vectorization when not using SLP" patch.

	* tree-vectorizer.h (vect_analyze_slp): Add force_single_lane
	parameter.
	* tree-vect-slp.cc (vect_analyze_slp_instance): Remove
	defaulting of force_single_lane.
	(vect_build_slp_instance): Likewise.  Pass down appropriate
	force_single_lane.
	(vect_analyze_slp): Add force_single_lane parameter and pass
	it down appropriately.
	(vect_slp_analyze_bb_1): Always do multi-lane SLP.
	* tree-vect-loop.cc (vect_analyze_loop_2): Track two SLP
	modes and adjust accordingly.
	(vect_analyze_loop_1): Save the SLP mode when unrolling.
---
 gcc/tree-vect-loop.cc | 25 +++++++++++++------------
 gcc/tree-vect-slp.cc  | 43 +++++++++++++++++++++++++------------------
 gcc/tree-vectorizer.h |  2 +-
 3 files changed, 39 insertions(+), 31 deletions(-)
  

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 199d79029e4..8bf231e98ec 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2718,7 +2718,7 @@  vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
 static opt_result
 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
 		     unsigned *suggested_unroll_factor,
-		     bool& slp_done_for_suggested_uf)
+		     unsigned& slp_done_for_suggested_uf)
 {
   opt_result ok = opt_result::success ();
   int res;
@@ -2787,11 +2787,11 @@  vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
   /* If the slp decision is false when suggested unroll factor is worked
      out, and we are applying suggested unroll factor, we can simply skip
      all slp related analyses this time.  */
-  bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
+  unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;
 
   /* Classify all cross-iteration scalar data-flow cycles.
      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
-  vect_analyze_scalar_cycles (loop_vinfo, slp);
+  vect_analyze_scalar_cycles (loop_vinfo, slp == 2);
 
   vect_pattern_recog (loop_vinfo);
 
@@ -2859,7 +2859,8 @@  vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
     {
       /* Check the SLP opportunities in the loop, analyze and build
 	 SLP trees.  */
-      ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
+      ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo),
+			     slp == 1);
       if (!ok)
 	return ok;
 
@@ -3212,15 +3213,14 @@  again:
   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
   gcc_assert (!ok);
 
-  /* Try again with SLP forced off but if we didn't do any SLP there is
+  /* Try again with SLP degraded but if we didn't do any SLP there is
      no point in re-trying.  */
   if (!slp)
     return ok;
 
-  /* If the slp decision is true when suggested unroll factor is worked
-     out, and we are applying suggested unroll factor, we don't need to
-     re-try any more.  */
-  if (applying_suggested_uf && slp_done_for_suggested_uf)
+  /* If we are applying suggested unroll factor, we don't need to
+     re-try any more as we want to keep the SLP mode fixed.  */
+  if (applying_suggested_uf)
     return ok;
 
   /* If there are reduction chains re-trying will fail anyway.  */
@@ -3268,8 +3268,9 @@  again:
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "re-trying with SLP disabled\n");
 
-  /* Roll back state appropriately.  No SLP this time.  */
-  slp = false;
+  /* Roll back state appropriately.  Degrade SLP this time.  From multi-
+     to single-lane to disabled.  */
+  --slp;
   /* Restore vectorization factor as it were without SLP.  */
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
   /* Free the SLP instances.  */
@@ -3414,7 +3415,7 @@  vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
   machine_mode vector_mode = vector_modes[mode_i];
   loop_vinfo->vector_mode = vector_mode;
   unsigned int suggested_unroll_factor = 1;
-  bool slp_done_for_suggested_uf = false;
+  unsigned slp_done_for_suggested_uf = 0;
 
   /* Run the main analysis.  */
   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 4fcb9e2fa2b..c8af4d320eb 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3488,7 +3488,7 @@  vect_analyze_slp_instance (vec_info *vinfo,
 			   scalar_stmts_to_slp_tree_map_t *bst_map,
 			   stmt_vec_info stmt_info, slp_instance_kind kind,
 			   unsigned max_tree_size, unsigned *limit,
-			   bool force_single_lane = false);
+			   bool force_single_lane);
 
 /* Build an interleaving scheme for the store sources RHS_NODES from
    SCALAR_STMTS.  */
@@ -3684,7 +3684,7 @@  vect_build_slp_instance (vec_info *vinfo,
 			 scalar_stmts_to_slp_tree_map_t *bst_map,
 			 /* ???  We need stmt_info for group splitting.  */
 			 stmt_vec_info stmt_info_,
-			 bool force_single_lane = false)
+			 bool force_single_lane)
 {
   /* If there's no budget left bail out early.  */
   if (*limit == 0)
@@ -3891,7 +3891,7 @@  vect_build_slp_instance (vec_info *vinfo,
 							       group1_size);
 	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
 						    kind, max_tree_size,
-						    limit);
+						    limit, false);
 	      /* Split the rest at the failure point and possibly
 		 re-analyze the remaining matching part if it has
 		 at least two lanes.  */
@@ -3904,14 +3904,14 @@  vect_build_slp_instance (vec_info *vinfo,
 		  if (i - group1_size > 1)
 		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
 						      kind, max_tree_size,
-						      limit);
+						      limit, false);
 		}
 	      /* Re-analyze the non-matching tail if it has at least
 		 two lanes.  */
 	      if (i + 1 < group_size)
 		res |= vect_analyze_slp_instance (vinfo, bst_map,
 						  rest, kind, max_tree_size,
-						  limit);
+						  limit, false);
 	      return res;
 	    }
 	}
@@ -4544,7 +4544,8 @@  vect_lower_load_permutations (loop_vec_info loop_vinfo,
    trees of packed scalar stmts if SLP is possible.  */
 
 opt_result
-vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
+vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
+		  bool force_single_lane)
 {
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   unsigned int i;
@@ -4561,7 +4562,8 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   /* Find SLP sequences starting from groups of grouped stores.  */
   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
     vect_analyze_slp_instance (vinfo, bst_map, first_element,
-			       slp_inst_kind_store, max_tree_size, &limit);
+			       slp_inst_kind_store, max_tree_size, &limit,
+			       force_single_lane);
 
   /* For loops also start SLP discovery from non-grouped stores.  */
   if (loop_vinfo)
@@ -4581,7 +4583,7 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	    stmts.quick_push (stmt_info);
 	    vect_build_slp_instance (vinfo, slp_inst_kind_store,
 				     stmts, roots, remain, max_tree_size,
-				     &limit, bst_map, NULL);
+				     &limit, bst_map, NULL, force_single_lane);
 	  }
     }
 
@@ -4598,7 +4600,8 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 				       bb_vinfo->roots[i].stmts,
 				       bb_vinfo->roots[i].roots,
 				       bb_vinfo->roots[i].remain,
-				       max_tree_size, &limit, bst_map, NULL))
+				       max_tree_size, &limit, bst_map, NULL,
+				       false))
 	    {
 	      bb_vinfo->roots[i].stmts = vNULL;
 	      bb_vinfo->roots[i].roots = vNULL;
@@ -4614,9 +4617,11 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	if (! STMT_VINFO_RELEVANT_P (first_element)
 	    && ! STMT_VINFO_LIVE_P (first_element))
 	  ;
-	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
-					      slp_inst_kind_reduc_chain,
-					      max_tree_size, &limit))
+	else if (force_single_lane
+		 || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
+						 slp_inst_kind_reduc_chain,
+						 max_tree_size, &limit,
+						 force_single_lane))
 	  {
 	    /* Dissolve reduction chain group.  */
 	    stmt_vec_info vinfo = first_element;
@@ -4656,7 +4661,8 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 		{
 		  /* Do not discover SLP reductions combining lane-reducing
 		     ops, that will fail later.  */
-		  if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
+		  if (!force_single_lane
+		      && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
 		    scalar_stmts.quick_push (next_info);
 		  else
 		    {
@@ -4670,7 +4676,8 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 					       slp_inst_kind_reduc_group,
 					       stmts, roots, remain,
 					       max_tree_size, &limit,
-					       bst_map, NULL);
+					       bst_map, NULL,
+					       force_single_lane);
 		    }
 		}
 	    }
@@ -4683,7 +4690,7 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 					   slp_inst_kind_reduc_group,
 					   scalar_stmts, roots, remain,
 					   max_tree_size, &limit, bst_map,
-					   NULL))
+					   NULL, force_single_lane))
 	    {
 	      if (scalar_stmts.length () <= 1)
 		scalar_stmts.release ();
@@ -4699,7 +4706,7 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 					   slp_inst_kind_reduc_group,
 					   stmts, roots, remain,
 					   max_tree_size, &limit,
-					   bst_map, NULL);
+					   bst_map, NULL, force_single_lane);
 		}
 	      saved_stmts.release ();
 	    }
@@ -4731,7 +4738,7 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 					 slp_inst_kind_reduc_group,
 					 stmts, roots, remain,
 					 max_tree_size, &limit,
-					 bst_map, NULL);
+					 bst_map, NULL, force_single_lane);
 	      }
 	  }
     }
@@ -8934,7 +8941,7 @@  vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 
   /* Check the SLP opportunities in the basic block, analyze and build SLP
      trees.  */
-  if (!vect_analyze_slp (bb_vinfo, n_stmts))
+  if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
     {
       if (dump_enabled_p ())
 	{
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 699ae9e33ba..53105f9292f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2538,7 +2538,7 @@  extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree>
 					  unsigned * = nullptr, bool = false);
 extern bool vect_slp_analyze_operations (vec_info *);
 extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
-extern opt_result vect_analyze_slp (vec_info *, unsigned);
+extern opt_result vect_analyze_slp (vec_info *, unsigned, bool);
 extern bool vect_make_slp_decision (loop_vec_info);
 extern void vect_detect_hybrid_slp (loop_vec_info);
 extern void vect_optimize_slp (vec_info *);