tree-optimization/102880 - improve CD-DCE

Message ID 68rp9374-2s2o-667r-7p40-212n8913181q@fhfr.qr
State Committed
Commit 045206450386bcd774db3bde0c696828402361c6
Headers
Series tree-optimization/102880 - improve CD-DCE |

Commit Message

Richard Biener Nov. 16, 2021, 10:31 a.m. UTC
  The PR shows a missed control-dependent DCE caused by CFG cleanup
merging a forwarder resulting in a partially degenerate PHI node.
With control-dependent DCE we need to mark control dependences
of incoming edges into PHIs as necessary but that is unnecessarily
conservative for the case when two edges have the same value.
There is no easy way to mark only a subset of control dependences
of both edges necessary so the fix is to produce forwarder blocks
where then the control dependence captures the requirements more
precisely.

For gcc.dg/tree-ssa/ssa-dom-thread-7.c the number of edges in the
CFG decrease as we have commonized PHI arguments which in turn
results in different threadings.  The testcase is too complex
and the dump scanning too simple to do anything meaningful here
but to adjust the number of expected threads.

The same CFG massaging could be useful at RTL expansion time to
reduce the number of copies we need to insert on edges.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-11-12  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/102880
	* tree-ssa-dce.c (sort_phi_args): New function.
	(make_forwarders_with_degenerate_phis): Likewise.
	(perform_tree_ssa_dce): Call
	make_forwarders_with_degenerate_phis.

	* gcc.dg/tree-ssa/pr102880.c: New testcase.
	* gcc.dg/tree-ssa/pr69270-3.c: Robustify.
	* gcc.dg/tree-ssa/ssa-dom-thread-7.c: Change the number of
	expected threadings.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr102880.c      |  27 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr69270-3.c     |   2 +-
 .../gcc.dg/tree-ssa/ssa-dom-thread-7.c        |   2 +-
 gcc/tree-ssa-dce.c                            | 171 +++++++++++++++++-
 4 files changed, 196 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr102880.c
  

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr102880.c b/gcc/testsuite/gcc.dg/tree-ssa/pr102880.c
new file mode 100644
index 00000000000..0306deedb6c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr102880.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+void foo(void);
+
+static int b, c, d, e, f, ah;
+static short g, ai, am, aq, as;
+static char an, at, av, ax, ay;
+static char a(char h, char i) { return i == 0 || h && i == 1 ? 0 : h % i; }
+static void ae(int h) {
+  if (a(b, h))
+    foo();
+
+}
+int main() {
+  ae(1);
+  ay = a(0, ay);
+  ax = a(g, aq);
+  at = a(0, as);
+  av = a(c, 1);
+  an = a(am, f);
+  int al = e || ((a(1, ah) && b) & d) == 2;
+  ai = al;
+}
+
+/* We should eliminate the call to foo.  */
+/* { dg-final { scan-tree-dump-not "foo" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr69270-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr69270-3.c
index 89735f67de2..5ffd5f71506 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr69270-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr69270-3.c
@@ -3,7 +3,7 @@ 
 
 /* We're looking for a constant argument a PHI node.  There
    should only be one if we unpropagate correctly.  */
-/* { dg-final { scan-tree-dump-times ", 1" 1 "uncprop1"} } */
+/* { dg-final { scan-tree-dump-times "<1\|, 1" 1 "uncprop1"} } */
 
 typedef long unsigned int size_t;
 typedef union gimple_statement_d *gimple;
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
index d40a61fd725..b64e71dae22 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@@ -11,7 +11,7 @@ 
    to change decisions in switch expansion which in turn can expose new
    jump threading opportunities.  Skip the later tests on aarch64.  */
 /* { dg-final { scan-tree-dump-not "Jumps threaded"  "dom3" { target { ! aarch64*-*-* } } } } */
-/* { dg-final { scan-tree-dump "Jumps threaded: 11"  "thread2" { target { ! aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump "Jumps threaded: 7"  "thread2" { target { ! aarch64*-*-* } } } } */
 /* { dg-final { scan-tree-dump "Jumps threaded: 18"  "thread2" { target { aarch64*-*-* } } } } */
 
 enum STATE {
diff --git a/gcc/tree-ssa-dce.c b/gcc/tree-ssa-dce.c
index 1281e67489c..dbf02c434de 100644
--- a/gcc/tree-ssa-dce.c
+++ b/gcc/tree-ssa-dce.c
@@ -67,6 +67,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-scalar-evolution.h"
 #include "tree-ssa-propagate.h"
 #include "gimple-fold.h"
+#include "tree-ssa.h"
 
 static struct stmt_stats
 {
@@ -1612,6 +1613,164 @@  tree_dce_done (bool aggressive)
   worklist.release ();
 }
 
+/* Sort PHI argument values for make_forwarders_with_degenerate_phis.  */
+
+static int
+sort_phi_args (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<edge, hashval_t> *) a_;
+  auto *b = (const std::pair<edge, hashval_t> *) b_;
+  hashval_t ha = a->second;
+  hashval_t hb = b->second;
+  if (ha < hb)
+    return -1;
+  else if (ha > hb)
+    return 1;
+  else
+    return 0;
+}
+
+/* Look for a non-virtual PHIs and make a forwarder block when all PHIs
+   have the same argument on a set of edges.  This is to not consider
+   control dependences of individual edges for same values but only for
+   the common set.  */
+
+static unsigned
+make_forwarders_with_degenerate_phis (function *fn)
+{
+  unsigned todo = 0;
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fn)
+    {
+      /* Only PHIs with three or more arguments have opportunities.  */
+      if (EDGE_COUNT (bb->preds) < 3)
+	continue;
+      /* Do not touch loop headers.  */
+      if (bb->loop_father->header == bb)
+	continue;
+
+      /* Take one PHI node as template to look for identical
+	 arguments.  Build a vector of candidates forming sets
+	 of argument edges with equal values.  Note optimality
+	 depends on the particular choice of the template PHI
+	 since equal arguments are unordered leaving other PHIs
+	 with more than one set of equal arguments within this
+	 argument range unsorted.  We'd have to break ties by
+	 looking at other PHI nodes.  */
+      gphi_iterator gsi = gsi_start_nonvirtual_phis (bb);
+      if (gsi_end_p (gsi))
+	continue;
+      gphi *phi = gsi.phi ();
+      auto_vec<std::pair<edge, hashval_t>, 8> args;
+      for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+	{
+	  edge e = gimple_phi_arg_edge (phi, i);
+	  /* Skip abnormal edges since we cannot redirect them.  */
+	  if (e->flags & EDGE_ABNORMAL)
+	    continue;
+	  /* Skip loop exit edges when we are in loop-closed SSA form
+	     since the forwarder we'd create does not have a PHI node.  */
+	  if (loops_state_satisfies_p (LOOP_CLOSED_SSA)
+	      && loop_exit_edge_p (e->src->loop_father, e))
+	    continue;
+	  args.safe_push (std::make_pair (e, iterative_hash_expr
+					     (gimple_phi_arg_def (phi, i), 0)));
+	}
+      if (args.length () < 2)
+	continue;
+      args.qsort (sort_phi_args);
+
+      /* From the candidates vector now verify true candidates for
+	 forwarders and create them.  */
+      gphi *vphi = get_virtual_phi (bb);
+      unsigned start = 0;
+      while (start < args.length () - 1)
+	{
+	  unsigned i;
+	  for (i = start + 1; i < args.length (); ++i)
+	    if (!operand_equal_p (PHI_ARG_DEF_FROM_EDGE (phi, args[start].first),
+				  PHI_ARG_DEF_FROM_EDGE (phi, args[i].first)))
+	      break;
+	  /* args[start]..args[i-1] are equal.  */
+	  if (start != i - 1)
+	    {
+	      /* Check all PHI nodes for argument equality.  */
+	      bool equal = true;
+	      gphi_iterator gsi2 = gsi;
+	      gsi_next (&gsi2);
+	      for (; !gsi_end_p (gsi2); gsi_next (&gsi2))
+		{
+		  gphi *phi2 = gsi2.phi ();
+		  if (virtual_operand_p (gimple_phi_result (phi2)))
+		    continue;
+		  tree start_arg
+		    = PHI_ARG_DEF_FROM_EDGE (phi2, args[start].first);
+		  for (unsigned j = start + 1; j < i; ++j)
+		    {
+		      if (!operand_equal_p (start_arg,
+					    PHI_ARG_DEF_FROM_EDGE
+					      (phi2, args[j].first)))
+			{
+			  /* Another PHI might have a shorter set of
+			     equivalent args.  Go for that.  */
+			  i = j;
+			  if (j == start + 1)
+			    equal = false;
+			  break;
+			}
+		    }
+		  if (!equal)
+		    break;
+		}
+	      if (equal)
+		{
+		  /* If we are asked to forward all edges the block
+		     has all degenerate PHIs.  Do nothing in that case.  */
+		  if (start == 0
+		      && i == args.length ()
+		      && args.length () == gimple_phi_num_args (phi))
+		    break;
+		  /* Instead of using make_forwarder_block we are
+		     rolling our own variant knowing that the forwarder
+		     does not need PHI nodes apart from eventually
+		     a virtual one.  */
+		  auto_vec<tree, 8> vphi_args;
+		  if (vphi)
+		    {
+		      vphi_args.reserve_exact (i - start);
+		      for (unsigned j = start; j < i; ++j)
+			vphi_args.quick_push
+			  (PHI_ARG_DEF_FROM_EDGE (vphi, args[j].first));
+		    }
+		  free_dominance_info (fn, CDI_DOMINATORS);
+		  basic_block forwarder = split_edge (args[start].first);
+		  for (unsigned j = start + 1; j < i; ++j)
+		    {
+		      edge e = args[j].first;
+		      redirect_edge_and_branch_force (e, forwarder);
+		      redirect_edge_var_map_clear (e);
+		    }
+		  if (vphi)
+		    {
+		      tree def = copy_ssa_name (vphi_args[0]);
+		      gphi *vphi_copy = create_phi_node (def, forwarder);
+		      for (unsigned j = start; j < i; ++j)
+			add_phi_arg (vphi_copy, vphi_args[j - start],
+				     args[j].first, UNKNOWN_LOCATION);
+		      SET_PHI_ARG_DEF
+			(vphi, single_succ_edge (forwarder)->dest_idx, def);
+		    }
+		  todo |= TODO_cleanup_cfg;
+		}
+	    }
+	  /* Continue searching for more opportunities.  */
+	  start = i;
+	}
+    }
+  return todo;
+}
+
 /* Main routine to eliminate dead code.
 
    AGGRESSIVE controls the aggressiveness of the algorithm.
@@ -1630,8 +1789,7 @@  static unsigned int
 perform_tree_ssa_dce (bool aggressive)
 {
   bool something_changed = 0;
-
-  calculate_dominance_info (CDI_DOMINATORS);
+  unsigned todo = 0;
 
   /* Preheaders are needed for SCEV to work.
      Simple lateches and recorded exits improve chances that loop will
@@ -1644,6 +1802,11 @@  perform_tree_ssa_dce (bool aggressive)
 			   | LOOPS_HAVE_RECORDED_EXITS);
     }
 
+  if (aggressive)
+    todo |= make_forwarders_with_degenerate_phis (cfun);
+
+  calculate_dominance_info (CDI_DOMINATORS);
+
   tree_dce_init (aggressive);
 
   if (aggressive)
@@ -1701,9 +1864,9 @@  perform_tree_ssa_dce (bool aggressive)
       free_numbers_of_iterations_estimates (cfun);
       if (in_loop_pipeline)
 	scev_reset ();
-      return TODO_update_ssa | TODO_cleanup_cfg;
+      todo |= TODO_update_ssa | TODO_cleanup_cfg;
     }
-  return 0;
+  return todo;
 }
 
 /* Pass entry points.  */