[RFC,3/9] Add pass to prune unreachable parts of the ABI

Message ID 20210325215146.3597963-4-gprocida@google.com
State Superseded, archived
Headers
Series Utility to manipulate ABI XML |

Commit Message

Giuliano Procida March 25, 2021, 9:51 p.m. UTC
  In response to internal reports about very large ABIs for libraires
implemented in C++ but exposing only a C API, I wrote a script as a
means of investigating the issue. This has been adapted here.

The aim is to remove parts of the XML that are irrelevant to
abidiff. ELF symbols (and shared object dependencies) are taken to be
the graph roots for the purpose of reachability analysis.

On most XML files, this results in a modest saving (5-10%). However,
for the library that sparked the invesigaton, the resulting XML was
more than 2300 times smaller.

If this functionality is implemented in libagail tself, this pass will
become a no-op.

	* scripts/abitidy.pl (prune_unreachable): New function to
	determine the reachable declaration and type nodes in an XNL
	ABI and remove all unreachable ones.

Signed-off-by: Giuliano Procida <gprocida@google.com>
---
 scripts/abitidy.pl | 215 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 214 insertions(+), 1 deletion(-)
  

Patch

diff --git a/scripts/abitidy.pl b/scripts/abitidy.pl
index 1f74e267..c9f93ed8 100755
--- a/scripts/abitidy.pl
+++ b/scripts/abitidy.pl
@@ -140,23 +140,233 @@  sub drop_empty($node) {
   }
 }
 
+# Remove unreachable declarations and types.
+#
+# When making a graph from ABI XML, the following are the types of
+# "node" we care about. The "edges" are obtained from a few XML
+# attributes as well as via XML element containment.
+#
+#  ELF (exported) symbols
+#
+#   elf-symbol (has a name; the code here currently knows nothing
+#     about aliases)
+#
+#  Declarations (that mention a symbol)
+#
+#   These live in a scope. In C++ scopes can be nested and include
+#   namespaces and class types.
+#
+#  var-decl (also used for member variables)
+#    elf-symbol linked to symbol via mangled-name
+#    type-id links to a type
+#  function-decl (also used for member functions)
+#    elf-symbol linked to symbol via mangled-name
+#    parameter and return type-ids link to types
+#      (alas, not just a link to a function type)
+#
+#  Types
+#
+#   These occupy pretty much all the other elements, besides those
+#   that act as simple containers.
+sub prune_unreachable($dom) {
+  my %elf_symbols;
+  # Graph vertices (only needed for statistics).
+  my %vertices;
+  # Graph edges.
+  my %edges;
+
+  # Keep track of type / symbol nesting.
+  my @stack;
+
+  # Traverse the whole XML DOM.
+  my sub make_graph($node) {
+    # The XML attributes we care about.
+    my $name;
+    my $id;
+    my $type_id;
+    my $symbol;
+    my $naming_typedef_id;
+
+    # Not every node we encounter is an XML element.
+    if ($node->nodeType == XML_ELEMENT_NODE) {
+      $name = $node->getAttribute('name');
+      $id = $node->getAttribute('id');
+      $type_id = $node->getAttribute('type-id');
+      $symbol = $node->getAttribute('mangled-name');
+      $naming_typedef_id = $node->getAttribute('naming-typedef-id');
+      die if defined $id && defined $symbol;
+    }
+
+    if (defined $name && $node->getName() eq 'elf-symbol') {
+      $elf_symbols{$name} = undef;
+      # Early return is safe, but not necessary.
+      return;
+    }
+
+    if (defined $id) {
+      my $vertex = "type:$id";
+      # This element defines a type (but there may be more than one
+      # defining the same type - we cannot rely on uniqueness).
+      $vertices{$vertex} = undef;
+      if (defined $naming_typedef_id) {
+        # This is an odd one, there can be a backwards link from an
+        # anonymous type to the typedef that refers to it, so we need to
+        # pull in the typedef, even if nothing else refers to it.
+        $edges{$vertex}{"type:$naming_typedef_id"} = undef;
+      }
+      if (@stack) {
+        # Parent<->child dependencies; record dependencies both
+        # ways to avoid holes in XML types and declarations.
+        $edges{$stack[-1]}{$vertex} = undef;
+        $edges{$vertex}{$stack[-1]} = undef;
+      }
+      push @stack, $vertex;
+    }
+
+    if (defined $symbol) {
+      my $vertex = "symbol:$symbol";
+      # This element is a declaration linked to a symbol (whether or not
+      # exported).
+      $vertices{$vertex} = undef;
+      if (@stack) {
+        # Parent<->child dependencies; record dependencies both ways
+        # to avoid holes in XML types and declarations.
+        #
+        # Symbols exist outside of the type hierarchy, so choosing to
+        # make them depend on a containing type scope and vice versa
+        # is conservative and probably not necessary.
+        $edges{$stack[-1]}{$vertex} = undef;
+        $edges{$vertex}{$stack[-1]} = undef;
+      }
+      # The symbol depends on the types mentioned in this element, so
+      # record it.
+      push @stack, $vertex;
+      # In practice there will be at most one symbol on the stack; we
+      # could verify this here, but it wouldn't achieve anything.
+    }
+
+    if (defined $type_id) {
+      if (@stack) {
+        # The enclosing type or symbol refers to another type.
+        $edges{$stack[-1]}{"type:$type_id"} = undef;
+      }
+    }
+
+    for my $child ($node->childNodes()) {
+      __SUB__->($child);
+    }
+
+    if (defined $symbol) {
+      pop @stack;
+    }
+    if (defined $id) {
+      pop @stack;
+    }
+  }
+
+  # Build a graph.
+  make_graph($dom);
+  die if @stack;
+  #warn Dumper(\%elf_symbols, \%vertices, \%edges);
+
+  # DFS visited state. Would be nicer with a flat namespace of nodes.
+  my %seen;
+  my sub dfs($vertex) {
+    no warnings 'recursion';
+    return if exists $seen{$vertex};
+    $seen{$vertex} = undef;
+
+    my $tos = $edges{$vertex};
+    if (defined $tos) {
+      for my $to (keys %$tos) {
+        __SUB__->($to);
+      }
+    }
+  }
+
+  # Traverse the graph, starting from the exported symbols.
+  for my $symbol (keys %elf_symbols) {
+    my $vertex = "symbol:$symbol";
+    if (exists $vertices{$vertex}) {
+      dfs($vertex);
+    } else {
+      warn "no declaration found for ELF symbol $symbol\n";
+    }
+  }
+
+  #warn Dumper(\%seen);
+
+  # Useful counts.
+  my sub print_report() {
+    my $count_elf_symbols = scalar keys %elf_symbols;
+    my $count_vertices = scalar keys %vertices;
+    my $count_seen = scalar keys %seen;
+
+    warn qq{ELF = $count_elf_symbols
+vertices = $count_vertices
+seen = $count_seen
+};
+  }
+
+  #print_report();
+
+  # XPath selection is too slow as we end up enumerating lots of
+  # nested items whose preservation is entirely determined by their
+  # containing items. DFS with early stopping for the win.
+  my sub remove_unwanted($node) {
+    my $node_name = $node->getName();
+    my $name;
+    my $id;
+    my $symbol;
+
+    if ($node->nodeType == XML_ELEMENT_NODE) {
+      $name = $node->getAttribute('name');
+      $id = $node->getAttribute('id');
+      $symbol = $node->getAttribute('mangled-name');
+      die if defined $id && defined $symbol;
+    }
+
+    # Return if we know that this is a type or declaration to keep or
+    # drop in its entirety.
+    if (defined $id) {
+      remove_node($node) unless exists $seen{"type:$id"};
+      return;
+    }
+    if ($node_name eq 'var-decl' || $node_name eq 'function-decl') {
+      remove_node($node) unless defined $symbol && exists $seen{"symbol:$symbol"};
+      return;
+    }
+
+    # Otherwise, this is not a type, declaration or part thereof, so
+    # process child elements.
+    for my $child ($node->childNodes()) {
+      __SUB__->($child);
+    }
+  }
+
+  remove_unwanted($dom);
+}
+
 # Parse arguments.
 my $input_opt;
 my $output_opt;
 my $all_opt;
 my $drop_opt;
+my $prune_opt;
 GetOptions('i|input=s' => \$input_opt,
            'o|output=s' => \$output_opt,
            'a|all' => sub {
-             $drop_opt = 1
+             $drop_opt = $prune_opt = 1
            },
            'd|drop-empty!' => \$drop_opt,
+           'p|prune-unreachable!' => \$prune_opt,
   ) and !@ARGV or die("usage: $0",
                       map { (' ', $_) } (
                         '[-i|--input file]',
                         '[-o|--output file]',
                         '[-a|--all]',
                         '[-d|--[no-]drop-empty]',
+                        '[-p|--[no-]prune-unreachable]',
                       ), "\n");
 
 exit 0 unless defined $input_opt;
@@ -169,6 +379,9 @@  close $input;
 # This simplifies DOM analysis and manipulation.
 strip_text($dom);
 
+# Prune unreachable elements.
+prune_unreachable($dom) if $prune_opt;
+
 # Drop empty elements.
 drop_empty($dom) if $drop_opt;