PING: [PATCH/RFC 2/2] WPD: Enable whole program devirtualization at LTRANS

Message ID SN6PR01MB49585DB63E96F1A3FBD513E3F7AA9@SN6PR01MB4958.prod.exchangelabs.com
State New
Headers
Series PING: [PATCH/RFC 2/2] WPD: Enable whole program devirtualization at LTRANS |

Commit Message

Feng Xue OS Sept. 30, 2021, 3:59 a.m. UTC
  Made some minor changes.

Thanks,
Feng
  

Patch

From 2c0d243b0c092585561c732bac490700f41001fb Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Mon, 6 Sep 2021 20:34:50 +0800
Subject: [PATCH 2/2] WPD: Enable whole program devirtualization at LTRANS

The whole-program assumption does not hold when WPA splits the compilation
into more than one LTRANS partition. To avoid losing information needed for
WPD at LTRANS, we record all vtable nodes and related member function
references in each partition.

2021-09-07  Feng Xue  <fxue@os.amperecomputing.com>

gcc/
	* tree.h (TYPE_CXX_LOCAL): New macro for type using
	base.nothrow_flag.
	* tree-core.h (tree_base): Update comment on using
	base.nothrow_flag to represent TYPE_CXX_LOCAL.
	* ipa-devirt.c (odr_type_d::whole_program_local): Remove.
	(odr_type_d::whole_program_local_p): Check TYPE_CXX_LOCAL flag
	on type, and enable WPD at LTRANS when flag_devirtualize_fully
	is true.
	(get_odr_type): Remove setting whole_program_local flag on type.
	(identify_whole_program_local_types): Replace whole_program_local
	in odr_type_d by TYPE_CXX_LOCAL on type.
	(maybe_record_node): Enable WPD at LTRANS when
	flag_devirtualize_fully is true.
	* ipa.c (can_remove_vtable_if_no_refs_p): Retain vtables at LTRANS
	stage under full devirtualization.
	* lto-cgraph.c (compute_ltrans_boundary): Add all defined vtables
	to boundary of each LTRANS partition.
	* lto-streamer-out.c (get_symbol_initial_value): Stream out
	initial value of vtable even if its class is optimized away.
	* lto-streamer-in.c (lto_input_tree): There might be more than
	one decl in dref_queue; register debuginfo for all of them.
	* lto-lang.c (lto_post_options): Disable full devirtualization
	if flag_ltrans_devirtualize is false.
	* tree-streamer-in.c (unpack_ts_base_value_fields): Unpack value
	of TYPE_CXX_LOCAL for a type from streaming data.
	* tree-streamer-out.c (pack_ts_base_value_fields): Pack value
	of TYPE_CXX_LOCAL for a type into streaming data.

temp
---
 gcc/ipa-devirt.c        | 29 ++++++++++++++++++-----------
 gcc/ipa.c               |  7 ++++++-
 gcc/lto-cgraph.c        | 18 ++++++++++++++++++
 gcc/lto-streamer-in.c   |  3 +--
 gcc/lto-streamer-out.c  | 12 +++++++++++-
 gcc/lto/lto-lang.c      |  6 ++++++
 gcc/tree-core.h         |  3 +++
 gcc/tree-streamer-in.c  | 11 ++++++++---
 gcc/tree-streamer-out.c | 11 ++++++++---
 gcc/tree.h              |  5 +++++
 10 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c
index a7d04388dab..4ff551bace8 100644
--- a/gcc/ipa-devirt.c
+++ b/gcc/ipa-devirt.c
@@ -216,8 +216,6 @@  struct GTY(()) odr_type_d
   int id;
   /* Is it in anonymous namespace? */
   bool anonymous_namespace;
-  /* Set when type is not used outside of program.  */
-  bool whole_program_local;
   /* Did we report ODR violation here?  */
   bool odr_violated;
   /* Set when virtual table without RTTI prevailed table with.  */
@@ -290,10 +288,18 @@  get_type_vtable (tree type)
 bool
 odr_type_d::whole_program_local_p ()
 {
-  if (flag_ltrans)
+  if (flag_ltrans && !flag_devirtualize_fully)
     return false;
 
-  return whole_program_local;
+  if (in_lto_p)
+    return TYPE_CXX_LOCAL (type);
+
+  /* Although a local class is always considered whole program local in
+     the LGEN stage, it may not be so in the LTO stage if multiple duplicated
+     primary vtables are attached to the class due to C++ privatizing via
+     -fno-weak.  Thus, we cannot set the TYPE_CXX_LOCAL flag for a local
+     class at the LGEN stage when building the ODR type.  */
+  return anonymous_namespace || decl_function_context (TYPE_NAME (type));
 }
 
 /* Return TRUE if ODR type may have any instance.  */
@@ -2007,11 +2013,6 @@  get_odr_type (tree type, bool insert)
       else
 	val->anonymous_namespace = 0;
 
-      if (!in_lto_p
-	  && (val->anonymous_namespace
-	      || decl_function_context (TYPE_NAME (type))))
-	val->whole_program_local = true;
-
       build_bases = COMPLETE_TYPE_P (val->type);
       insert_to_odr_array = true;
       *slot = val;
@@ -2607,9 +2608,15 @@  identify_whole_program_local_types (void)
 
 	  if (multi_vtable_p)
 	    continue;
+
+	  /* Mark all equivalent types in the ODR type as whole program local,
+	     because representative type of the ODR type at LTRANS might not
+	     be the one at WPA.  */
+	  FOR_EACH_VEC_ELT (*(type->types), i, equiv_type)
+	    TYPE_CXX_LOCAL (equiv_type) = 1;
 	}
 
-      type->whole_program_local = true;
+      TYPE_CXX_LOCAL (type->type) = 1;
     }
 
   delete no_rtti_files;
@@ -2726,7 +2733,7 @@  maybe_record_node (vec <cgraph_node *> &nodes,
      Currently we ignore these functions in speculative devirtualization.
      ??? Maybe it would make sense to be more aggressive for LTO even
      elsewhere.  */
-  if (!flag_ltrans
+  if ((!flag_ltrans || flag_devirtualize_fully)
       && !pure_virtual
       && type_in_anonymous_namespace_p (DECL_CONTEXT (target))
       && (!target_node
diff --git a/gcc/ipa.c b/gcc/ipa.c
index 6e200a906b5..52d6a9e63be 100644
--- a/gcc/ipa.c
+++ b/gcc/ipa.c
@@ -268,7 +268,7 @@  can_remove_vtable_if_no_refs_p (varpool_node *vnode)
   if (!flag_devirtualize_fully)
     return true;
 
-  if (DECL_EXTERNAL (vnode->decl))
+  if (DECL_EXTERNAL (vnode->decl) && !vnode->in_other_partition)
     return true;
 
   /* We will force generating vtables in LGEN stage even they are "unused",
@@ -276,6 +276,11 @@  can_remove_vtable_if_no_refs_p (varpool_node *vnode)
   if (!in_lto_p && flag_generate_lto)
     return false;
 
+  /* All vtables seen at the LTRANS stage are the result of dead class
+     elimination at WPA, so there is no need to prune them further.  */
+  if (flag_ltrans)
+    return false;
+
   return true;
 }
 
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 7c3e276a8ea..12c82eebc3f 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -859,6 +859,24 @@  compute_ltrans_boundary (lto_symtab_encoder_t in_encoder)
       lto_set_symtab_encoder_encode_initializer (encoder, vnode);
       create_references (encoder, vnode);
     }
+
+  /* Add all defined vtables to streaming encoder, so that we could
+     reconstruct whole type inheritance graph in LTRANS as what it was in
+     WPA to enable late full devirtualization.
+
+     TODO: For a partition, only need to encode a subset of vtables, not
+     all of them.  */
+  if (flag_devirtualize_fully && flag_ltrans_devirtualize && flag_wpa)
+    {
+      varpool_node *vnode;
+
+      FOR_EACH_DEFINED_VARIABLE (vnode)
+	if (DECL_VIRTUAL_P (vnode->decl) && !DECL_EXTERNAL (vnode->decl)
+	    && TYPE_CXX_LOCAL (DECL_CONTEXT (vnode->decl))
+	    && lto_symtab_encoder_lookup (encoder, vnode) == LCC_NOT_FOUND)
+	  lto_symtab_encoder_encode (encoder, vnode);
+    }
+
   /* Pickle in also the initializer of all referenced readonly variables
      to help folding.  Constant pool variables are not shared, so we must
      pickle those too.  */
diff --git a/gcc/lto-streamer-in.c b/gcc/lto-streamer-in.c
index eb8a7dc57b0..b0902a28d59 100644
--- a/gcc/lto-streamer-in.c
+++ b/gcc/lto-streamer-in.c
@@ -1905,11 +1905,10 @@  lto_input_tree (class lto_input_block *ib, class data_in *data_in)
     }
   tree t = lto_input_tree_1 (ib, data_in, tag, 0);
 
-  if (!dref_queue.is_empty ())
+  while (!dref_queue.is_empty ())
     {
       dref_entry e = dref_queue.pop ();
       debug_hooks->register_external_die (e.decl, e.sym, e.off);
-      gcc_checking_assert (dref_queue.is_empty ());
     }
   return t;
 }
diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c
index a26d4885800..c36efb2e25c 100644
--- a/gcc/lto-streamer-out.c
+++ b/gcc/lto-streamer-out.c
@@ -437,7 +437,17 @@  get_symbol_initial_value (lto_symtab_encoder_t encoder, tree expr)
 	 scalar values.  */
       if (!(vnode = varpool_node::get (expr))
 	  || !lto_symtab_encoder_encode_initializer_p (encoder, vnode))
-        initial = error_mark_node;
+	{
+	  /* Even when a class is optimized away, devirtualization at LTRANS
+	     still needs to extract addresses of member virtual functions from
+	     initial value of vtable.  */
+	  if (flag_devirtualize_fully && flag_ltrans_devirtualize
+	      && flag_wpa && DECL_VIRTUAL_P (expr) && !DECL_EXTERNAL (expr))
+	    return initial;
+
+	  initial = error_mark_node;
+	}
+
       if (initial != error_mark_node)
 	{
 	  long max_size = 30;
diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c
index c13c7e45ac1..a48b55f7ada 100644
--- a/gcc/lto/lto-lang.c
+++ b/gcc/lto/lto-lang.c
@@ -870,6 +870,12 @@  lto_post_options (const char **pfilename ATTRIBUTE_UNUSED)
       /* During LTRANS, we are not looking at the whole program, only
 	 a subset of the whole callgraph.  */
       flag_whole_program = 0;
+
+      /* Since full devirtualization at LTRANS requires streaming extra data
+	 on vtables, it only takes effect when flag_ltrans_devirtualize is
+	 also on.  */
+      if (!flag_ltrans_devirtualize)
+	flag_devirtualize_fully = 0;
     }
 
   if (flag_wpa)
diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index 4498c3ad127..08d59af022b 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -1328,6 +1328,9 @@  struct GTY(()) tree_base {
        DECL_NONALIASED in
 	  VAR_DECL
 
+       TYPE_CXX_LOCAL in
+	   all types
+
    deprecated_flag:
 
        TREE_DEPRECATED in
diff --git a/gcc/tree-streamer-in.c b/gcc/tree-streamer-in.c
index 984b1e269cf..1597fc102b0 100644
--- a/gcc/tree-streamer-in.c
+++ b/gcc/tree-streamer-in.c
@@ -129,10 +129,15 @@  unpack_ts_base_value_fields (struct bitpack_d *bp, tree expr)
     bp_unpack_value (bp, 1);
   TREE_ASM_WRITTEN (expr) = (unsigned) bp_unpack_value (bp, 1);
   if (TYPE_P (expr))
-    TYPE_ARTIFICIAL (expr) = (unsigned) bp_unpack_value (bp, 1);
+    {
+      TYPE_ARTIFICIAL (expr) = (unsigned) bp_unpack_value (bp, 1);
+      TYPE_CXX_LOCAL (expr) = (unsigned) bp_unpack_value (bp, 1);
+    }
   else
-    TREE_NO_WARNING (expr) = (unsigned) bp_unpack_value (bp, 1);
-  TREE_NOTHROW (expr) = (unsigned) bp_unpack_value (bp, 1);
+    {
+      TREE_NO_WARNING (expr) = (unsigned) bp_unpack_value (bp, 1);
+      TREE_NOTHROW (expr) = (unsigned) bp_unpack_value (bp, 1);
+    }
   TREE_STATIC (expr) = (unsigned) bp_unpack_value (bp, 1);
   if (TREE_CODE (expr) != TREE_BINFO)
     TREE_PRIVATE (expr) = (unsigned) bp_unpack_value (bp, 1);
diff --git a/gcc/tree-streamer-out.c b/gcc/tree-streamer-out.c
index 1a43534d117..7ec719ed59b 100644
--- a/gcc/tree-streamer-out.c
+++ b/gcc/tree-streamer-out.c
@@ -102,10 +102,15 @@  pack_ts_base_value_fields (struct bitpack_d *bp, tree expr)
   bp_pack_value (bp, (TREE_CODE (expr) != SSA_NAME
 		      ? 0 : TREE_ASM_WRITTEN (expr)), 1);
   if (TYPE_P (expr))
-    bp_pack_value (bp, TYPE_ARTIFICIAL (expr), 1);
+    {
+      bp_pack_value (bp, TYPE_ARTIFICIAL (expr), 1);
+      bp_pack_value (bp, TYPE_CXX_LOCAL (expr), 1);
+    }
   else
-    bp_pack_value (bp, TREE_NO_WARNING (expr), 1);
-  bp_pack_value (bp, TREE_NOTHROW (expr), 1);
+    {
+      bp_pack_value (bp, TREE_NO_WARNING (expr), 1);
+      bp_pack_value (bp, TREE_NOTHROW (expr), 1);
+    }
   bp_pack_value (bp, TREE_STATIC (expr), 1);
   if (TREE_CODE (expr) != TREE_BINFO)
     bp_pack_value (bp, TREE_PRIVATE (expr), 1);
diff --git a/gcc/tree.h b/gcc/tree.h
index e5547e8491c..6842de7a24b 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -2254,6 +2254,11 @@  extern tree vector_element_bits_tree (const_tree);
 #define TYPE_FINAL_P(NODE) \
   (RECORD_OR_UNION_CHECK (NODE)->base.default_def_flag)
 
+/* If nonzero, it indicates that a C++ type (mainly polymorphic class) is
+   local to the program being compiled, i.e. not referenced outside of it.
+   This is used in C++ devirtualization.  */
+#define TYPE_CXX_LOCAL(NODE) (TYPE_CHECK (NODE)->base.nothrow_flag)
+
 /* The debug output functions use the symtab union field to store
    information specific to the debugging format.  The different debug
    output hooks store different types in the union field.  These three
-- 
2.17.1