Do less redundant vect_transform_slp_perm_load calls

Message ID 20250912122024.8930B136DB@imap1.dmz-prg2.suse.org
State Committed
Commit 80f9440b5ecb1c3ab943b5c862f84e06fbeabd89
Headers
Series Do less redundant vect_transform_slp_perm_load calls

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint success Lint passed
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-test success Testing passed

Commit Message

Richard Biener Sept. 12, 2025, 12:20 p.m. UTC
  The following tries to call vect_transform_slp_perm_load exactly
once during analysis and once during transform.  There is a second
call left during analysis, in get_load_store_type.  As a temporary
measure, this records n_perms in the load-store info and asserts at
transform stage that the value computed there matches it.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

	* tree-vectorizer.h (vect_load_store_data::n_perms): New.
	* tree-vect-stmts.cc (vectorizable_load): Analyze
	SLP_TREE_LOAD_PERMUTATION only once and remember n_perms.
	Verify the transform-time n_perms against the value stored
	during analysis.
---
 gcc/tree-vect-stmts.cc | 47 +++++++++++++++++++++++-------------------
 gcc/tree-vectorizer.h  |  1 +
 2 files changed, 27 insertions(+), 21 deletions(-)
  

Patch

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7eabf169a2b..d0ae19baebb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9478,6 +9478,7 @@  vectorizable_load (vec_info *vinfo,
 
   /* ???  The following checks should really be part of
      get_load_store_type.  */
+  unsigned n_perms = -1U;
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
       && !((memory_access_type == VMAT_ELEMENTWISE
 	    || mat_gather_scatter_p (memory_access_type))
@@ -9485,7 +9486,7 @@  vectorizable_load (vec_info *vinfo,
     {
       slp_perm = true;
 
-      if (!loop_vinfo)
+      if (!loop_vinfo && cost_vec)
 	{
 	  /* In BB vectorization we may not actually use a loaded vector
 	     accessing elements in excess of DR_GROUP_SIZE.  */
@@ -9508,17 +9509,21 @@  vectorizable_load (vec_info *vinfo,
 	    }
 	}
 
-      auto_vec<tree> tem;
-      unsigned n_perms;
-      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-					 true, &n_perms))
+      if (cost_vec)
 	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-			     vect_location,
-			     "unsupported load permutation\n");
-	  return false;
+	  if (!vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
+					     true, &n_perms))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				 vect_location,
+				 "unsupported load permutation\n");
+	      return false;
+	    }
+	  ls.n_perms = n_perms;
 	}
+      else
+	n_perms = ls.n_perms;
     }
 
   if (slp_node->ldst_lanes
@@ -9989,18 +9994,19 @@  vectorizable_load (vec_info *vinfo,
 	}
       if (slp_perm)
 	{
-	  unsigned n_perms;
 	  if (costing_p)
 	    {
-	      unsigned n_loads;
-	      vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
-					    true, &n_perms, &n_loads);
+	      gcc_assert (n_perms != -1U);
 	      inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
 					       slp_node, 0, vect_body);
 	    }
 	  else
-	    vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
-					  false, &n_perms);
+	    {
+	      unsigned n_perms2;
+	      vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
+					    false, &n_perms2);
+	      gcc_assert (n_perms == n_perms2);
+	    }
 	}
 
       if (costing_p)
@@ -11378,25 +11384,24 @@  vectorizable_load (vec_info *vinfo,
 
   if (slp_perm)
     {
-      unsigned n_perms;
       /* For SLP we know we've seen all possible uses of dr_chain so
 	 direct vect_transform_slp_perm_load to DCE the unused parts.
 	 ???  This is a hack to prevent compile-time issues as seen
 	 in PR101120 and friends.  */
       if (costing_p)
 	{
-	  vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
-					true, &n_perms, nullptr);
+	  gcc_assert (n_perms != -1U);
 	  if (n_perms != 0)
 	    inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
 					    slp_node, 0, vect_body);
 	}
       else
 	{
+	  unsigned n_perms2;
 	  bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
-						  gsi, vf, false, &n_perms,
+						  gsi, vf, false, &n_perms2,
 						  nullptr, true);
-	  gcc_assert (ok);
+	  gcc_assert (ok && n_perms == n_perms2);
 	}
       dr_chain.release ();
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 749a9830e07..6ac4299ede2 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -288,6 +288,7 @@  struct vect_load_store_data : vect_data {
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
   auto_vec<int> elsvals;
+  unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
 };
 
 /* A computation tree of an SLP instance.  Each node corresponds to a group of