x86: Speed up target attribute handling by using a cache

Message ID 20211122093607.GJ2646553@tucnak
State Committed
Headers
Series x86: Speed up target attribute handling by using a cache |

Commit Message

Jakub Jelinek Nov. 22, 2021, 9:36 a.m. UTC
  Hi!

The target attribute handling is very expensive and for the common case
from x86intrin.h where many functions get implicitly the same target
attribute, we can speed up compilation a lot by caching it.

The following patches both create a single entry cache, where they cache
for a particular target attribute argument list the resulting
DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
values from ix86_valid_target_attribute_p and use the cache if the
args are the same as last time and we start either from NULL values
of those, or from the recorded values for those from last time.

Compiling a simple:
 #include <x86intrin.h>

 int i;
testcase with ./cc1 -quiet -O2 -isystem include/ test.c
takes on my WS without the patches ~0.392s and with either of the
patches ~0.182s, i.e. roughly half the time as before.
For ./cc1plus -quiet -O2 -isystem include/ test.c
it is slightly worse, the speed up is from ~0.613s to ~0.403s.

The difference between the 2 patches is that the first one uses copy_list
while the second one uses a vec, so I think the second one has the advantage
of creating less GC garbage.
I've verified both patches achieve the same content of those
DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
nodes as before on x86intrin.h by doing debug_tree on those and comparing
the stderr from without these patches to with these patches.

Both patches were bootstrapped/regtested on x86_64-linux and i686-linux,
ok for trunk (and which one)?

2021-11-22  Jakub Jelinek  <jakub@redhat.com>

	* attribs.h (simple_cst_list_equal): Declare.
	* attribs.c (simple_cst_list_equal): No longer static.
	* config/i386/i386-options.c (target_attribute_cache): New variable.
	(ix86_valid_target_attribute_p): Cache DECL_FUNCTION_SPECIFIC_TARGET
	and DECL_FUNCTION_SPECIFIC_OPTIMIZATION based on args.


	Jakub
2021-11-22  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386-options.c (target_attribute_cache): New variable.
	(ix86_valid_target_attribute_p): Cache DECL_FUNCTION_SPECIFIC_TARGET
	and DECL_FUNCTION_SPECIFIC_OPTIMIZATION based on args.

--- gcc/config/i386/i386-options.c.jj	2021-11-20 23:53:35.730637746 +0100
+++ gcc/config/i386/i386-options.c	2021-11-21 18:13:16.948659255 +0100
@@ -1403,6 +1403,8 @@ ix86_valid_target_attribute_tree (tree f
   return t;
 }
 
+static GTY(()) vec<tree, va_gc> *target_attribute_cache;
+
 /* Hook to validate attribute((target("string"))).  */
 
 bool
@@ -1423,6 +1425,28 @@ ix86_valid_target_attribute_p (tree fnde
       && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
     return true;
 
+  if (target_attribute_cache
+      && (DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == (*target_attribute_cache)[0]
+	  || DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE)
+      && (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
+	  == (*target_attribute_cache)[1]
+	  || DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE))
+    {
+      tree a;
+      unsigned int n = vec_safe_length (target_attribute_cache);
+      unsigned int i = 2;
+      for (a = args; a && i < n; a = TREE_CHAIN (a), ++i)
+	if (simple_cst_equal ((*target_attribute_cache)[i], TREE_VALUE (a)) != 1)
+	  break;
+      if (a == NULL_TREE && i == n)
+	{
+	  DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = (*target_attribute_cache)[0];
+	  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
+	    = (*target_attribute_cache)[1];
+	  return true;
+	}
+    }
+
   tree old_optimize = build_optimization_node (&global_options,
 					       &global_options_set);
 
@@ -1459,8 +1483,19 @@ ix86_valid_target_attribute_p (tree fnde
   if (new_target == error_mark_node)
     ret = false;
 
-  else if (fndecl && new_target)
+  else if (new_target)
     {
+      if (DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE
+	  && DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
+	{
+	  vec_safe_truncate (target_attribute_cache, 0);
+	  vec_safe_push (target_attribute_cache, new_target);
+	  vec_safe_push (target_attribute_cache, old_optimize != new_optimize
+						 ? new_optimize : NULL_TREE);
+	  for (tree a = args; a; a = TREE_CHAIN (a))
+	    vec_safe_push (target_attribute_cache, TREE_VALUE (a));
+	}
+
       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
 
       if (old_optimize != new_optimize)
  

Comments

Martin Liška Nov. 22, 2021, 1:03 p.m. UTC | #1
On 11/22/21 10:36, Jakub Jelinek via Gcc-patches wrote:
> Hi!
> 
> The target attribute handling is very expensive and for the common case
> from x86intrin.h where many functions get implicitly the same target
> attribute, we can speed up compilation a lot by caching it.
> 
> The following patches both create a single entry cache, where they cache
> for a particular target attribute argument list the resulting
> DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
> values from ix86_valid_target_attribute_p and use the cache if the
> args are the same as last time and we start either from NULL values
> of those, or from the recorded values for those from last time.
> 
> Compiling a simple:
>   #include <x86intrin.h>
> 
>   int i;
> testcase with ./cc1 -quiet -O2 -isystem include/ test.c
> takes on my WS without the patches ~0.392s and with either of the
> patches ~0.182s, i.e. roughly half the time as before.
> For ./cc1plus -quiet -O2 -isystem include/ test.c
> it is slightly worse, the speed up is from ~0.613s to ~0.403s.
> 
> The difference between the 2 patches is that the first one uses copy_list
> while the second one uses a vec, so I think the second one has the advantage
> of creating less GC garbage.

Hello.

I see only one patch attached, Jakub. Can you please send also the second one?

> I've verified both patches achieve the same content of those
> DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
> nodes as before on x86intrin.h by doing debug_tree on those and comparing
> the stderr from without these patches to with these patches.
> 
> Both patches were bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk (and which one)?
> 
> 2021-11-22  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* attribs.h (simple_cst_list_equal): Declare.
> 	* attribs.c (simple_cst_list_equal): No longer static.
> 	* config/i386/i386-options.c (target_attribute_cache): New variable.
> 	(ix86_valid_target_attribute_p): Cache DECL_FUNCTION_SPECIFIC_TARGET
> 	and DECL_FUNCTION_SPECIFIC_OPTIMIZATION based on args.
> 
> --- gcc/attribs.h.jj	2021-11-11 14:35:37.442350841 +0100
> +++ gcc/attribs.h	2021-11-19 11:52:08.843252645 +0100
> @@ -60,6 +60,7 @@ extern tree build_type_attribute_variant
>   extern tree build_decl_attribute_variant (tree, tree);
>   extern tree build_type_attribute_qual_variant (tree, tree, int);
>   
> +extern bool simple_cst_list_equal (const_tree, const_tree);
>   extern bool attribute_value_equal (const_tree, const_tree);
>   
>   /* Return 0 if the attributes for two types are incompatible, 1 if they
> --- gcc/attribs.c.jj	2021-11-11 14:35:37.442350841 +0100
> +++ gcc/attribs.c	2021-11-19 11:51:43.473615692 +0100
> @@ -1290,7 +1290,7 @@ cmp_attrib_identifiers (const_tree attr1
>   /* Compare two constructor-element-type constants.  Return 1 if the lists
>      are known to be equal; otherwise return 0.  */
>   
> -static bool
> +bool
>   simple_cst_list_equal (const_tree l1, const_tree l2)
>   {
>     while (l1 != NULL_TREE && l2 != NULL_TREE)
> --- gcc/config/i386/i386-options.c.jj	2021-11-15 13:19:07.347900863 +0100
> +++ gcc/config/i386/i386-options.c	2021-11-20 00:27:32.919505947 +0100
> @@ -1403,6 +1403,8 @@ ix86_valid_target_attribute_tree (tree f
>     return t;
>   }
>   
> +static GTY(()) tree target_attribute_cache[3];

I would come up with a struct that would wrap the 3 trees as
target_attribute_cache[index] accessing is not much intuitive.

Thanks,
Martin

> +
>   /* Hook to validate attribute((target("string"))).  */
>   
>   bool
> @@ -1423,6 +1425,19 @@ ix86_valid_target_attribute_p (tree fnde
>         && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
>       return true;
>   
> +  if ((DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == target_attribute_cache[1]
> +       || DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE)
> +      && (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
> +	  == target_attribute_cache[2]
> +	  || DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
> +      && simple_cst_list_equal (args, target_attribute_cache[0]))
> +    {
> +      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_attribute_cache[1];
> +      DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
> +	= target_attribute_cache[2];
> +      return true;
> +    }
> +
>     tree old_optimize = build_optimization_node (&global_options,
>   					       &global_options_set);
>   
> @@ -1456,8 +1471,17 @@ ix86_valid_target_attribute_p (tree fnde
>     if (new_target == error_mark_node)
>       ret = false;
>   
> -  else if (fndecl && new_target)
> +  else if (new_target)
>       {
> +      if (DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE
> +	  && DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
> +	{
> +	  target_attribute_cache[0] = copy_list (args);
> +	  target_attribute_cache[1] = new_target;
> +	  target_attribute_cache[2]
> +	    = old_optimize != new_optimize ? new_optimize : NULL_TREE;
> +	}
> +
>         DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
>   
>         if (old_optimize != new_optimize)
> 
> 	Jakub
>
  
Jakub Jelinek Nov. 22, 2021, 1:05 p.m. UTC | #2
On Mon, Nov 22, 2021 at 02:03:19PM +0100, Martin Liška wrote:
> I see only one patch attached, Jakub. Can you please send also the second one?

The first one has been inlined in the mail body (the one with attribs.[ch]
changes), the second one has been attached (the one without that).

	Jakub
  
Martin Liška Nov. 22, 2021, 1:10 p.m. UTC | #3
On 11/22/21 14:05, Jakub Jelinek wrote:
> On Mon, Nov 22, 2021 at 02:03:19PM +0100, Martin Liška wrote:
>> I see only one patch attached, Jakub. Can you please send also the second one?
> 
> The first one has been inlined in the mail body (the one with attribs.[ch]
> changes), the second one has been attached (the one without that).
> 
> 	Jakub
> 

Ah, sorry, I see. I prefer the copy_list version as it achieves more readable code.

Martin
  
Jakub Jelinek Nov. 22, 2021, 1:17 p.m. UTC | #4
On Mon, Nov 22, 2021 at 02:10:08PM +0100, Martin Liška wrote:
> On 11/22/21 14:05, Jakub Jelinek wrote:
> > On Mon, Nov 22, 2021 at 02:03:19PM +0100, Martin Liška wrote:
> > > I see only one patch attached, Jakub. Can you please send also the second one?
> > 
> > The first one has been inlined in the mail body (the one with attribs.[ch]
> > changes), the second one has been attached (the one without that).
> > 
> > 	Jakub
> > 
> 
> Ah, sorry, I see. I prefer the copy_list version as it achieves more readable code.

But it will allocate fresh memory on each target attribute that isn't
cached.
Perhaps we could add free_list that would free_node all the TREE_LISTs in a
chain.

	Jakub
  
Uros Bizjak Dec. 1, 2021, 2:58 p.m. UTC | #5
On Mon, Nov 22, 2021 at 10:36 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The target attribute handling is very expensive and for the common case
> from x86intrin.h where many functions get implicitly the same target
> attribute, we can speed up compilation a lot by caching it.
>
> The following patches both create a single entry cache, where they cache
> for a particular target attribute argument list the resulting
> DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
> values from ix86_valid_target_attribute_p and use the cache if the
> args are the same as last time and we start either from NULL values
> of those, or from the recorded values for those from last time.
>
> Compiling a simple:
>  #include <x86intrin.h>
>
>  int i;
> testcase with ./cc1 -quiet -O2 -isystem include/ test.c
> takes on my WS without the patches ~0.392s and with either of the
> patches ~0.182s, i.e. roughly half the time as before.
> For ./cc1plus -quiet -O2 -isystem include/ test.c
> it is slightly worse, the speed up is from ~0.613s to ~0.403s.
>
> The difference between the 2 patches is that the first one uses copy_list
> while the second one uses a vec, so I think the second one has the advantage
> of creating less GC garbage.
> I've verified both patches achieve the same content of those
> DECL_FUNCTION_SPECIFIC_TARGET and DECL_FUNCTION_SPECIFIC_OPTIMIZATION
> nodes as before on x86intrin.h by doing debug_tree on those and comparing
> the stderr from without these patches to with these patches.
>
> Both patches were bootstrapped/regtested on x86_64-linux and i686-linux,
> ok for trunk (and which one)?
>
> 2021-11-22  Jakub Jelinek  <jakub@redhat.com>
>
>         * attribs.h (simple_cst_list_equal): Declare.
>         * attribs.c (simple_cst_list_equal): No longer static.
>         * config/i386/i386-options.c (target_attribute_cache): New variable.
>         (ix86_valid_target_attribute_p): Cache DECL_FUNCTION_SPECIFIC_TARGET
>         and DECL_FUNCTION_SPECIFIC_OPTIMIZATION based on args.

LGTM, but really I'll just rubber-stamp the patch.

Uros.

> --- gcc/attribs.h.jj    2021-11-11 14:35:37.442350841 +0100
> +++ gcc/attribs.h       2021-11-19 11:52:08.843252645 +0100
> @@ -60,6 +60,7 @@ extern tree build_type_attribute_variant
>  extern tree build_decl_attribute_variant (tree, tree);
>  extern tree build_type_attribute_qual_variant (tree, tree, int);
>
> +extern bool simple_cst_list_equal (const_tree, const_tree);
>  extern bool attribute_value_equal (const_tree, const_tree);
>
>  /* Return 0 if the attributes for two types are incompatible, 1 if they
> --- gcc/attribs.c.jj    2021-11-11 14:35:37.442350841 +0100
> +++ gcc/attribs.c       2021-11-19 11:51:43.473615692 +0100
> @@ -1290,7 +1290,7 @@ cmp_attrib_identifiers (const_tree attr1
>  /* Compare two constructor-element-type constants.  Return 1 if the lists
>     are known to be equal; otherwise return 0.  */
>
> -static bool
> +bool
>  simple_cst_list_equal (const_tree l1, const_tree l2)
>  {
>    while (l1 != NULL_TREE && l2 != NULL_TREE)
> --- gcc/config/i386/i386-options.c.jj   2021-11-15 13:19:07.347900863 +0100
> +++ gcc/config/i386/i386-options.c      2021-11-20 00:27:32.919505947 +0100
> @@ -1403,6 +1403,8 @@ ix86_valid_target_attribute_tree (tree f
>    return t;
>  }
>
> +static GTY(()) tree target_attribute_cache[3];
> +
>  /* Hook to validate attribute((target("string"))).  */
>
>  bool
> @@ -1423,6 +1425,19 @@ ix86_valid_target_attribute_p (tree fnde
>        && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
>      return true;
>
> +  if ((DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == target_attribute_cache[1]
> +       || DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE)
> +      && (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
> +         == target_attribute_cache[2]
> +         || DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
> +      && simple_cst_list_equal (args, target_attribute_cache[0]))
> +    {
> +      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_attribute_cache[1];
> +      DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
> +       = target_attribute_cache[2];
> +      return true;
> +    }
> +
>    tree old_optimize = build_optimization_node (&global_options,
>                                                &global_options_set);
>
> @@ -1456,8 +1471,17 @@ ix86_valid_target_attribute_p (tree fnde
>    if (new_target == error_mark_node)
>      ret = false;
>
> -  else if (fndecl && new_target)
> +  else if (new_target)
>      {
> +      if (DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE
> +         && DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
> +       {
> +         target_attribute_cache[0] = copy_list (args);
> +         target_attribute_cache[1] = new_target;
> +         target_attribute_cache[2]
> +           = old_optimize != new_optimize ? new_optimize : NULL_TREE;
> +       }
> +
>        DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
>
>        if (old_optimize != new_optimize)
>
>         Jakub
  

Patch

--- gcc/attribs.h.jj	2021-11-11 14:35:37.442350841 +0100
+++ gcc/attribs.h	2021-11-19 11:52:08.843252645 +0100
@@ -60,6 +60,7 @@  extern tree build_type_attribute_variant
 extern tree build_decl_attribute_variant (tree, tree);
 extern tree build_type_attribute_qual_variant (tree, tree, int);
 
+extern bool simple_cst_list_equal (const_tree, const_tree);
 extern bool attribute_value_equal (const_tree, const_tree);
 
 /* Return 0 if the attributes for two types are incompatible, 1 if they
--- gcc/attribs.c.jj	2021-11-11 14:35:37.442350841 +0100
+++ gcc/attribs.c	2021-11-19 11:51:43.473615692 +0100
@@ -1290,7 +1290,7 @@  cmp_attrib_identifiers (const_tree attr1
 /* Compare two constructor-element-type constants.  Return 1 if the lists
    are known to be equal; otherwise return 0.  */
 
-static bool
+bool
 simple_cst_list_equal (const_tree l1, const_tree l2)
 {
   while (l1 != NULL_TREE && l2 != NULL_TREE)
--- gcc/config/i386/i386-options.c.jj	2021-11-15 13:19:07.347900863 +0100
+++ gcc/config/i386/i386-options.c	2021-11-20 00:27:32.919505947 +0100
@@ -1403,6 +1403,8 @@  ix86_valid_target_attribute_tree (tree f
   return t;
 }
 
+static GTY(()) tree target_attribute_cache[3];
+
 /* Hook to validate attribute((target("string"))).  */
 
 bool
@@ -1423,6 +1425,19 @@  ix86_valid_target_attribute_p (tree fnde
       && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
     return true;
 
+  if ((DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == target_attribute_cache[1]
+       || DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE)
+      && (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
+	  == target_attribute_cache[2]
+	  || DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
+      && simple_cst_list_equal (args, target_attribute_cache[0]))
+    {
+      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_attribute_cache[1];
+      DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl)
+	= target_attribute_cache[2];
+      return true;
+    }
+
   tree old_optimize = build_optimization_node (&global_options,
 					       &global_options_set);
 
@@ -1456,8 +1471,17 @@  ix86_valid_target_attribute_p (tree fnde
   if (new_target == error_mark_node)
     ret = false;
 
-  else if (fndecl && new_target)
+  else if (new_target)
     {
+      if (DECL_FUNCTION_SPECIFIC_TARGET (fndecl) == NULL_TREE
+	  && DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) == NULL_TREE)
+	{
+	  target_attribute_cache[0] = copy_list (args);
+	  target_attribute_cache[1] = new_target;
+	  target_attribute_cache[2]
+	    = old_optimize != new_optimize ? new_optimize : NULL_TREE;
+	}
+
       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
 
       if (old_optimize != new_optimize)