backwards threader costing TLC and a fix

Message ID 20220803095030.B28ED38582A4@sourceware.org
State New
Headers
Series backwards threader costing TLC and a fix |

Commit Message

Richard Biener Aug. 3, 2022, 9:49 a.m. UTC
  The previous change to the backwards threader costing contained a
mistake that can make us reject a path based on size when the
full thread path is not known yet and the full path would be considered
hot but the partial path not yet.

Instead of adding another simple fix for this particular issue I
took the approach of making sure profitable_path_p is only called
for the full path we finally consider threading and split out the
size computation part that is useful to limit the greedy search
early.  That simplifies profitable_path_p.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Any objections or comments?

Thanks,
Richard.

	* tree-ssa-threadbackward.cc
	(back_threader::m_path_insns): New.
	(back_threader::back_threader): Initialize m_path_insns.
	(back_threader_profitability::profitable_path_p): Take
	n_insns, do not take name as argument.  taken and
	irreducible_loop are now always passed and not NULL.
	Split out path size computation to ...
	(back_threader::find_paths_to_names): ... the greedy
	search, applying the number of blocks and size limits here.
	Do not call profitable_path_p before calling
	maybe_register_path.
	(back_threader::maybe_register_path): Pass m_path_insns
	rather than m_name to profitable_path_p.
---
 gcc/tree-ssa-threadbackward.cc | 337 ++++++++++++++++-----------------
 1 file changed, 160 insertions(+), 177 deletions(-)
  

Patch

diff --git a/gcc/tree-ssa-threadbackward.cc b/gcc/tree-ssa-threadbackward.cc
index ba114e98a41..3fb05c4c75b 100644
--- a/gcc/tree-ssa-threadbackward.cc
+++ b/gcc/tree-ssa-threadbackward.cc
@@ -64,8 +64,8 @@  public:
   back_threader_profitability (bool speed_p)
     : m_speed_p (speed_p)
   { }
-  bool profitable_path_p (const vec<basic_block> &, tree name, edge taken,
-			  bool *irreducible_loop = NULL);
+  bool profitable_path_p (const vec<basic_block> &, int n_insns,
+			  edge taken, bool *irreducible_loop);
 private:
   const bool m_speed_p;
 };
@@ -104,6 +104,7 @@  private:
 
   // Current path being analyzed.
   auto_vec<basic_block> m_path;
+  int m_path_insns;
   // Hash to mark visited BBs while analyzing a path.
   hash_set<basic_block> m_visited_bbs;
   // The set of SSA names, any of which could potentially change the
@@ -133,6 +134,7 @@  const edge back_threader::UNREACHABLE_EDGE = (edge) -1;
 
 back_threader::back_threader (function *fun, unsigned flags, bool first)
   : m_profit (flags & BT_SPEED),
+    m_path_insns (0),
     m_first (first)
 {
   if (flags & BT_SPEED)
@@ -242,8 +244,8 @@  back_threader::maybe_register_path ()
       else
 	{
 	  bool irreducible = false;
-	  if (m_profit.profitable_path_p (m_path, m_name, taken_edge,
-					  &irreducible)
+	  if (m_profit.profitable_path_p (m_path, m_path_insns,
+					  taken_edge, &irreducible)
 	      && debug_counter ())
 	    {
 	      m_registry.register_path (m_path, taken_edge);
@@ -411,58 +413,142 @@  back_threader::find_paths_to_names (basic_block bb, bitmap interesting)
   if (m_visited_bbs.add (bb))
     return;
 
-  m_path.safe_push (bb);
-
-  // Try to resolve the path without looking back.
-  if (m_path.length () > 1
-      && (!m_profit.profitable_path_p (m_path, m_name, NULL)
-	  || maybe_register_path ()))
+  if (m_path.length () >= (unsigned) param_max_fsm_thread_length)
     {
-      m_path.pop ();
-      m_visited_bbs.remove (bb);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "  FAIL: Jump-thread path not considered: "
+		 "the number of basic blocks on the path "
+		 "exceeds PARAM_MAX_FSM_THREAD_LENGTH.\n");
       return;
     }
 
-  auto_bitmap processed;
-  bool done = false;
-  // We use a worklist instead of iterating through the bitmap,
-  // because we may add new items in-flight.
-  auto_vec<tree> worklist (bitmap_count_bits (interesting));
-  populate_worklist (worklist, interesting);
-  while (!worklist.is_empty ())
+  int old_path_insns = m_path_insns;
+  /* When pushing BB we have to add the previous block size since that is
+     now part of the thread.  When this is the first block do not account
+     for the control stmt at the end since that is going to be elided.
+     ???  It might be worth caching the basic-block sizes since we can
+     end up visiting it more than once.  Slight complication arises from
+     the special-casing of m_name.  */
+  if (!m_path.is_empty ())
     {
-      tree name = worklist.pop ();
-      unsigned i = SSA_NAME_VERSION (name);
-      gimple *def_stmt = SSA_NAME_DEF_STMT (name);
-      basic_block def_bb = gimple_bb (def_stmt);
-
-      // Process any PHIs defined in this block.
-      if (def_bb == bb
-	  && bitmap_set_bit (processed, i)
-	  && gimple_code (def_stmt) == GIMPLE_PHI)
+      basic_block new_bb = m_path.last ();
+      int new_bb_insns = 0;
+      /* PHIs in the path will create degenerate PHIS in the
+	 copied path which will then get propagated away, so
+	 looking at just the duplicate path the PHIs would
+	 seem unimportant.
+
+	 But those PHIs, because they're assignments to objects
+	 typically with lives that exist outside the thread path,
+	 will tend to generate PHIs (or at least new PHI arguments)
+	 at points where we leave the thread path and rejoin
+	 the original blocks.  So we do want to account for them.
+
+	 We ignore virtual PHIs.  We also ignore cases where BB
+	 has a single incoming edge.  That's the most common
+	 degenerate PHI we'll see here.  Finally we ignore PHIs
+	 that are associated with the value we're tracking as
+	 that object likely dies.  */
+      if (EDGE_COUNT (new_bb->succs) > 1 && EDGE_COUNT (new_bb->preds) > 1)
 	{
-	  resolve_phi (as_a<gphi *> (def_stmt), interesting);
-	  done = true;
-	  break;
+	  for (gphi_iterator gsip = gsi_start_phis (new_bb); !gsi_end_p (gsip);
+	       gsi_next (&gsip))
+	    {
+	      gphi *phi = gsip.phi ();
+	      tree dst = gimple_phi_result (phi);
+
+	      /* Note that if both NAME and DST are anonymous
+		 SSA_NAMEs, then we do not have enough information
+		 to consider them associated.  */
+	      if (dst != m_name
+		  && m_name
+		  && TREE_CODE (m_name) == SSA_NAME
+		  && (SSA_NAME_VAR (dst) != SSA_NAME_VAR (m_name)
+		      || !SSA_NAME_VAR (dst))
+		  && !virtual_operand_p (dst))
+		++new_bb_insns;
+	    }
 	}
+      gimple *elided = (m_path.length () == 1) ? last_stmt (new_bb) : NULL;
+      for (gimple_stmt_iterator gsi = gsi_after_labels (new_bb);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  /* Do not allow OpenACC loop markers and __builtin_constant_p on
+	     threading paths.  The latter is disallowed, because an
+	     expression might be constant on two threading paths, and
+	     become non-constant (i.e.: phi) when they merge.  */
+	  gimple *stmt = gsi_stmt (gsi);
+	  if (gimple_call_internal_p (stmt, IFN_UNIQUE)
+	      || gimple_call_builtin_p (stmt, BUILT_IN_CONSTANT_P))
+	    return;
+	  if (stmt != elided)
+	    new_bb_insns += estimate_num_insns (stmt, &eni_size_weights);
+	}
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Basic block %d adds %d insns to the path\n",
+		 new_bb->index, new_bb_insns);
+      if (m_path_insns + new_bb_insns >= param_max_fsm_thread_path_insns)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  FAIL: Jump-thread path not considered: "
+		     "the number of instructions on the path "
+		     "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
+	  return;
+	}
+      m_path_insns += new_bb_insns;
     }
-  // If there are interesting names not yet processed, keep looking.
-  if (!done)
+  m_path.safe_push (bb);
+
+  // Try to resolve the path without looking back further.
+  if (m_path.length () > 1 && maybe_register_path ())
+    ;
+
+  else
     {
-      bitmap_and_compl_into (interesting, processed);
-      if (!bitmap_empty_p (interesting))
+      auto_bitmap processed;
+      bool done = false;
+      // We use a worklist instead of iterating through the bitmap,
+      // because we may add new items in-flight.
+      auto_vec<tree> worklist (bitmap_count_bits (interesting));
+      populate_worklist (worklist, interesting);
+      while (!worklist.is_empty ())
+	{
+	  tree name = worklist.pop ();
+	  unsigned i = SSA_NAME_VERSION (name);
+	  gimple *def_stmt = SSA_NAME_DEF_STMT (name);
+	  basic_block def_bb = gimple_bb (def_stmt);
+
+	  // Process any PHIs defined in this block.
+	  if (def_bb == bb
+	      && bitmap_set_bit (processed, i)
+	      && gimple_code (def_stmt) == GIMPLE_PHI)
+	    {
+	      resolve_phi (as_a<gphi *> (def_stmt), interesting);
+	      done = true;
+	      break;
+	    }
+	}
+      // If there are interesting names not yet processed, keep looking.
+      if (!done)
 	{
-	  edge_iterator iter;
-	  edge e;
-	  FOR_EACH_EDGE (e, iter, bb->preds)
-	    if ((e->flags & EDGE_ABNORMAL) == 0)
-	      find_paths_to_names (e->src, interesting);
+	  bitmap_and_compl_into (interesting, processed);
+	  if (!bitmap_empty_p (interesting))
+	    {
+	      edge_iterator iter;
+	      edge e;
+	      FOR_EACH_EDGE (e, iter, bb->preds)
+		if ((e->flags & EDGE_ABNORMAL) == 0)
+		  find_paths_to_names (e->src, interesting);
+	    }
 	}
+
+      // Reset things to their original state.
+      bitmap_ior_into (interesting, processed);
     }
 
-  // Reset things to their original state.
-  bitmap_ior_into (interesting, processed);
+  /* Unwind from adding BB.  */
   m_path.pop ();
+  m_path_insns = old_path_insns;
   m_visited_bbs.remove (bb);
 }
 
@@ -572,13 +658,11 @@  back_threader::debug ()
 /* Examine jump threading path PATH and return TRUE if it is profitable to
    thread it, otherwise return FALSE.
 
-   NAME is the SSA_NAME of the variable we found to have a constant
-   value on PATH.  If unknown, SSA_NAME is NULL.
+   N_INSNS is the number of instructions on the threaded path.
 
-   If the taken edge out of the path is known ahead of time it is passed in
-   TAKEN_EDGE, otherwise it is NULL.
+   TAKEN_EDGE is the edge out of the path known to be taken.
 
-   CREATES_IRREDUCIBLE_LOOP, if non-null is set to TRUE if threading this path
+   *CREATES_IRREDUCIBLE_LOOP is set to TRUE if threading this path
    would create an irreducible loop.
 
    ?? It seems we should be able to loosen some of the restrictions in
@@ -586,35 +670,12 @@  back_threader::debug ()
 
 bool
 back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
-						tree name,
+						int n_insns,
 						edge taken_edge,
 						bool *creates_irreducible_loop)
 {
-  gcc_checking_assert (!m_path.is_empty ());
-
-  /* We can an empty path here (excluding the DEF block) when the
-     statement that makes a conditional generate a compile-time
-     constant result is in the same block as the conditional.
-
-     That's not really a jump threading opportunity, but instead is
-     simple cprop & simplification.  We could handle it here if we
-     wanted by wiring up all the incoming edges.  If we run this
-     early in IPA, that might be worth doing.   For now we just
-     reject that case.  */
-  if (m_path.length () <= 1)
-      return false;
+  gcc_assert (m_path.length () > 1);
 
-  if (m_path.length () > (unsigned) param_max_fsm_thread_length)
-    {
-      if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file, "  FAIL: Jump-thread path not considered: "
-		 "the number of basic blocks on the path "
-		 "exceeds PARAM_MAX_FSM_THREAD_LENGTH.\n");
-      return false;
-    }
-
-  int n_insns = 0;
-  gimple_stmt_iterator gsi;
   loop_p loop = m_path[0]->loop_father;
   bool threaded_through_latch = false;
   bool multiway_branch_in_path = false;
@@ -624,10 +685,7 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Checking profitability of path (backwards): ");
 
-  /* Count the number of instructions on the path: as these instructions
-     will have to be duplicated, we will not record the path if there
-     are too many instructions on the path.  Also check that all the
-     blocks in the path belong to a single loop.  */
+  /* Check CFG properties of the path.  */
   for (unsigned j = 0; j < m_path.length (); j++)
     {
       basic_block bb = m_path[j];
@@ -642,66 +700,8 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
 	 it ends in a multiway branch.  */
       if (j < m_path.length () - 1)
 	{
-	  int orig_n_insns = n_insns;
-	  /* PHIs in the path will create degenerate PHIS in the
-	     copied path which will then get propagated away, so
-	     looking at just the duplicate path the PHIs would
-	     seem unimportant.
-
-	     But those PHIs, because they're assignments to objects
-	     typically with lives that exist outside the thread path,
-	     will tend to generate PHIs (or at least new PHI arguments)
-	     at points where we leave the thread path and rejoin
-	     the original blocks.  So we do want to account for them.
-
-	     We ignore virtual PHIs.  We also ignore cases where BB
-	     has a single incoming edge.  That's the most common
-	     degenerate PHI we'll see here.  Finally we ignore PHIs
-	     that are associated with the value we're tracking as
-	     that object likely dies.  */
-	  if (EDGE_COUNT (bb->succs) > 1 && EDGE_COUNT (bb->preds) > 1)
-	    {
-	      for (gphi_iterator gsip = gsi_start_phis (bb);
-		   !gsi_end_p (gsip);
-		   gsi_next (&gsip))
-		{
-		  gphi *phi = gsip.phi ();
-		  tree dst = gimple_phi_result (phi);
-
-		  /* Note that if both NAME and DST are anonymous
-		     SSA_NAMEs, then we do not have enough information
-		     to consider them associated.  */
-		  if (dst != name
-		      && name
-		      && TREE_CODE (name) == SSA_NAME
-		      && (SSA_NAME_VAR (dst) != SSA_NAME_VAR (name)
-			  || !SSA_NAME_VAR (dst))
-		      && !virtual_operand_p (dst))
-		    ++n_insns;
-		}
-	    }
-
 	  if (!contains_hot_bb && m_speed_p)
 	    contains_hot_bb |= optimize_bb_for_speed_p (bb);
-	  for (gsi = gsi_after_labels (bb);
-	       !gsi_end_p (gsi);
-	       gsi_next_nondebug (&gsi))
-	    {
-	      /* Do not allow OpenACC loop markers and __builtin_constant_p on
-		 threading paths.  The latter is disallowed, because an
-		 expression might be constant on two threading paths, and
-		 become non-constant (i.e.: phi) when they merge.  */
-	      gimple *stmt = gsi_stmt (gsi);
-	      if (gimple_call_internal_p (stmt, IFN_UNIQUE)
-		  || gimple_call_builtin_p (stmt, BUILT_IN_CONSTANT_P))
-		return false;
-	      /* Do not count empty statements and labels.  */
-	      if (gimple_code (stmt) != GIMPLE_NOP
-		  && !is_gimple_debug (stmt))
-		n_insns += estimate_num_insns (stmt, &eni_size_weights);
-	    }
-	  if (dump_file && (dump_flags & TDF_DETAILS))
-	    fprintf (dump_file, " (%i insns)", n_insns-orig_n_insns);
 
 	  /* We do not look at the block with the threaded branch
 	     in this loop.  So if any block with a last statement that
@@ -711,8 +711,9 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
 	     The block in PATH[0] is special, it's the block were we're
 	     going to be able to eliminate its branch.  */
 	  gimple *last = last_stmt (bb);
-	  if (last && (gimple_code (last) == GIMPLE_SWITCH
-		       || gimple_code (last) == GIMPLE_GOTO))
+	  if (last
+	      && (gimple_code (last) == GIMPLE_SWITCH
+		  || gimple_code (last) == GIMPLE_GOTO))
 	    {
 	      if (j == 0)
 		threaded_multiway_branch = true;
@@ -732,52 +733,34 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
 	}
     }
 
-  gimple *stmt = get_gimple_control_stmt (m_path[0]);
-  gcc_assert (stmt);
-
-  /* We are going to remove the control statement at the end of the
-     last block in the threading path.  So don't count it against our
-     statement count.  */
-
-  int stmt_insns = estimate_num_insns (stmt, &eni_size_weights);
-  n_insns-= stmt_insns;
-
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "\n  Control statement insns: %i\n"
-	     "  Overall: %i insns\n",
-	     stmt_insns, n_insns);
-
-  if (creates_irreducible_loop)
-    {
-      /* If this path threaded through the loop latch back into the
-	 same loop and the destination does not dominate the loop
-	 latch, then this thread would create an irreducible loop.  */
-      *creates_irreducible_loop = false;
-      if (taken_edge
-	  && threaded_through_latch
-	  && loop == taken_edge->dest->loop_father
-	  && (determine_bb_domination_status (loop, taken_edge->dest)
-	      == DOMST_NONDOMINATING))
-	*creates_irreducible_loop = true;
-    }
+    fprintf (dump_file, "\n"
+	     "  Overall: %i insns\n", n_insns);
+
+  /* If this path threaded through the loop latch back into the
+     same loop and the destination does not dominate the loop
+     latch, then this thread would create an irreducible loop.  */
+  *creates_irreducible_loop = false;
+  if (threaded_through_latch
+      && loop == taken_edge->dest->loop_father
+      && (determine_bb_domination_status (loop, taken_edge->dest)
+	  == DOMST_NONDOMINATING))
+    *creates_irreducible_loop = true;
 
   /* Threading is profitable if the path duplicated is hot but also
      in a case we separate cold path from hot path and permit optimization
      of the hot path later.  Be on the agressive side here. In some testcases,
      as in PR 78407 this leads to noticeable improvements.  */
   if (m_speed_p
-      && ((taken_edge && optimize_edge_for_speed_p (taken_edge))
+      && (optimize_edge_for_speed_p (taken_edge)
 	  || contains_hot_bb))
     {
-      if (n_insns >= param_max_fsm_thread_path_insns)
-	{
-	  if (dump_file && (dump_flags & TDF_DETAILS))
-	    fprintf (dump_file, "  FAIL: Jump-thread path not considered: "
-		     "the number of instructions on the path "
-		     "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
-	  return false;
-	}
-      if (taken_edge && probably_never_executed_edge_p (cfun, taken_edge))
+      /* n_insns is already limited to param_max_fsm_thread_path_insns
+	 to limit the greedy search in find_paths_to_names.  */
+      /* Avoid separating cold paths when that is predicted to probably
+	 never execute since optimizing it may lead to bogus diagnostics
+	 like for PR106495 and PR105679.  */
+      if (probably_never_executed_edge_p (cfun, taken_edge))
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file, "  FAIL: Jump-thread path not considered: "
@@ -813,7 +796,6 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
      optimizer would have done anyway, so an irreducible loop is not
      so bad.  */
   if (!threaded_multiway_branch
-      && creates_irreducible_loop
       && *creates_irreducible_loop
       && (n_insns * (unsigned) param_fsm_scale_path_stmts
 	  > (m_path.length () *
@@ -860,16 +842,17 @@  back_threader_profitability::profitable_path_p (const vec<basic_block> &m_path,
      loop optimizations to fail.  Disable these threads until after
      loop optimizations have run.  */
   if ((threaded_through_latch
-       || (taken_edge && taken_edge->dest == loop->latch))
+       || taken_edge->dest == loop->latch)
       && !(cfun->curr_properties & PROP_loop_opts_done)
       && empty_block_p (loop->latch))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file,
-		 "  FAIL: Thread through latch before loop opts would create non-empty latch\n");
+		 "  FAIL: Thread through latch before loop opts would create a "
+		 "non-empty latch\n");
       return false;
-
     }
+
   return true;
 }