tree-optimization/114413 - SLP CSE after permute optimization

Message ID 20240619122731.6A97E388463E@sourceware.org
State New
Headers
Series tree-optimization/114413 - SLP CSE after permute optimization |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed

Commit Message

Richard Biener June 19, 2024, 12:27 p.m. UTC
  We currently fail to re-CSE SLP nodes after optimizing permutes
which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
this shows in not re-using the SLP node with the load and arithmetic
for both the store and the reduction.  The following implements
CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.

I've tried to make the CSE part of permute materialization but it
isn't a very good fit there.  I've not bothered to implement something
more complete, also handling external defs or defs without
SLP_TREE_SCALAR_STMTS.

I realize this might result in more BB SLP which in turn might slow
down code given costing for BB SLP is difficult (even that we now
vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
This is nevertheless feeding more accurate info to costing which is
good.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Does this look OK?

Thanks,
Richard.

	PR tree-optimization/114413
	* tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
	New function, split out from ...
	(vect_analyze_slp): ... here.  Call it.
	(vect_cse_slp_nodes): New function.
	(vect_optimize_slp): Call it.

	* gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
 gcc/tree-vect-slp.cc                  | 77 ++++++++++++++++++++++-----
 2 files changed, 71 insertions(+), 12 deletions(-)
  

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
index 4f72727b694..475b241c36e 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
@@ -38,3 +38,9 @@  int main()
     abort ();
   return 0;
 }
+
+/* This is a weak test but we want to re-use the arithmetic for both the
+   store and the reduction.  */
+/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { x86_64-*-* i?86-*-* } } } } */
+/* On i386 we vectorize both the store and the reduction.  */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 2552dacbd69..980d1e7267d 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1586,6 +1586,23 @@  bst_traits::equal (value_type existing, value_type candidate)
   return true;
 }
 
+typedef hash_map <vec <stmt_vec_info>, slp_tree,
+		  simple_hashmap_traits <bst_traits, slp_tree> >
+  scalar_stmts_to_slp_tree_map_t;
+
+/* Release BST_MAP.  */
+
+static void
+release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
+{
+  /* The map keeps a reference on SLP nodes built, release that.  */
+  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
+       it != bst_map->end (); ++it)
+    if ((*it).second)
+      vect_free_slp_tree ((*it).second);
+  delete bst_map;
+}
+
 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
    but then vec::insert does memmove and that's not compatible with
    std::pair.  */
@@ -1684,10 +1701,6 @@  vect_slp_linearize_chain (vec_info *vinfo,
     }
 }
 
-typedef hash_map <vec <stmt_vec_info>, slp_tree,
-		  simple_hashmap_traits <bst_traits, slp_tree> >
-  scalar_stmts_to_slp_tree_map_t;
-
 static slp_tree
 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 		       vec<stmt_vec_info> stmts, unsigned int group_size,
@@ -4308,14 +4321,7 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	}
     }
 
-
-
-  /* The map keeps a reference on SLP nodes built, release that.  */
-  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
-       it != bst_map->end (); ++it)
-    if ((*it).second)
-      vect_free_slp_tree ((*it).second);
-  delete bst_map;
+  release_scalar_stmts_to_slp_tree_map (bst_map);
 
   if (pattern_found && dump_enabled_p ())
     {
@@ -6373,6 +6379,43 @@  vect_optimize_slp_pass::run ()
   free_graph (m_slpg);
 }
 
+/* Apply CSE to NODE and its children using BST_MAP.  */
+
+static void
+vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
+{
+  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
+      && SLP_TREE_CODE (node) != VEC_PERM_EXPR
+      /* Besides some VEC_PERM_EXPR, two-operator nodes also
+	 lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
+	 we'd have sth that works for all internal and external nodes.  */
+      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
+    {
+      if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
+	{
+	  if (*leader != node)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "re-using SLP tree %p for %p\n",
+				 (void *)*leader, (void *)node);
+	      vect_free_slp_tree (node);
+	      (*leader)->refcnt += 1;
+	      node = *leader;
+	    }
+	  return;
+	}
+
+      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
+      node->refcnt += 1;
+      /* And recurse.  */
+    }
+
+  for (slp_tree &child : SLP_TREE_CHILDREN (node))
+    if (child)
+      vect_cse_slp_nodes (bst_map, child);
+}
+
 /* Optimize the SLP graph of VINFO.  */
 
 void
@@ -6381,6 +6424,16 @@  vect_optimize_slp (vec_info *vinfo)
   if (vinfo->slp_instances.is_empty ())
     return;
   vect_optimize_slp_pass (vinfo).run ();
+
+
+  /* Apply CSE again to nodes after permute optimization.  */
+  scalar_stmts_to_slp_tree_map_t *bst_map
+    = new scalar_stmts_to_slp_tree_map_t ();
+
+  for (auto inst : vinfo->slp_instances)
+    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
+
+  release_scalar_stmts_to_slp_tree_map (bst_map);
 }
 
 /* Gather loads reachable from the individual SLP graph entries.  */