diff mbox series

vect: Remove vec_outside/inside_cost fields

Message ID mpt1r3r2aic.fsf@arm.com
State Committed
Commit 772d76acb5aead98eb3c47a78363d867287d5e77
Headers show
Series vect: Remove vec_outside/inside_cost fields | expand

Commit Message

Richard Sandiford Nov. 8, 2021, 10:43 a.m. UTC
The vector costs now use a common base class instead of being
completely abstract.  This means that there's no longer a
need to record the inside and outside costs separately.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
	* tree-vectorizer.h (_loop_vec_info): Remove vec_outside_cost
	and vec_inside_cost.
	(vector_costs::outside_cost): New function.
	* tree-vectorizer.c (_loop_vec_info::_loop_vec_info): Update
	after above.
	(vect_estimate_min_profitable_iters): Likewise.
	(vect_better_loop_vinfo_p): Get the inside and outside costs
	from the loop_vec_infos' vector_costs.
---
 gcc/tree-vect-loop.c  | 24 ++++++++++--------------
 gcc/tree-vectorizer.h | 16 +++++++++-------
 2 files changed, 19 insertions(+), 21 deletions(-)

Comments

Richard Biener Nov. 8, 2021, 10:51 a.m. UTC | #1
On Mon, Nov 8, 2021 at 11:44 AM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The vector costs now use a common base class instead of being
> completely abstract.  This means that there's no longer a
> need to record the inside and outside costs separately.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Richard.

> Richard
>
>
> gcc/
>         * tree-vectorizer.h (_loop_vec_info): Remove vec_outside_cost
>         and vec_inside_cost.
>         (vector_costs::outside_cost): New function.
>         * tree-vectorizer.c (_loop_vec_info::_loop_vec_info): Update
>         after above.
>         (vect_estimate_min_profitable_iters): Likewise.
>         (vect_better_loop_vinfo_p): Get the inside and outside costs
>         from the loop_vec_infos' vector_costs.
> ---
>  gcc/tree-vect-loop.c  | 24 ++++++++++--------------
>  gcc/tree-vectorizer.h | 16 +++++++++-------
>  2 files changed, 19 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index b6a631d4384..dd4a363fee5 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -840,8 +840,6 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
>      scan_map (NULL),
>      slp_unrolling_factor (1),
>      single_scalar_iteration_cost (0),
> -    vec_outside_cost (0),
> -    vec_inside_cost (0),
>      inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
>      vectorizable (false),
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> @@ -2845,10 +2843,10 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
>        /* Compute the costs by multiplying the inside costs with the factor and
>          add the outside costs for a more complete picture.  The factor is the
>          amount of times we are expecting to iterate this epilogue.  */
> -      old_cost = old_loop_vinfo->vec_inside_cost * old_factor;
> -      new_cost = new_loop_vinfo->vec_inside_cost * new_factor;
> -      old_cost += old_loop_vinfo->vec_outside_cost;
> -      new_cost += new_loop_vinfo->vec_outside_cost;
> +      old_cost = old_loop_vinfo->vector_costs->body_cost () * old_factor;
> +      new_cost = new_loop_vinfo->vector_costs->body_cost () * new_factor;
> +      old_cost += old_loop_vinfo->vector_costs->outside_cost ();
> +      new_cost += new_loop_vinfo->vector_costs->outside_cost ();
>        return new_cost < old_cost;
>      }
>
> @@ -2865,8 +2863,8 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
>
>    /* Check whether the (fractional) cost per scalar iteration is lower
>       or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
> -  poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
> -  poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
> +  poly_int64 rel_new = new_loop_vinfo->vector_costs->body_cost () * old_vf;
> +  poly_int64 rel_old = old_loop_vinfo->vector_costs->body_cost () * new_vf;
>
>    HOST_WIDE_INT est_rel_new_min
>      = estimated_poly_value (rel_new, POLY_VALUE_MIN);
> @@ -2918,8 +2916,10 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
>
>    /* If there's nothing to choose between the loop bodies, see whether
>       there's a difference in the prologue and epilogue costs.  */
> -  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
> -    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
> +  auto old_outside_cost = old_loop_vinfo->vector_costs->outside_cost ();
> +  auto new_outside_cost = new_loop_vinfo->vector_costs->outside_cost ();
> +  if (new_outside_cost != old_outside_cost)
> +    return new_outside_cost < old_outside_cost;
>
>    return false;
>  }
> @@ -4272,10 +4272,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>
>    vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
>
> -  /* Stash the costs so that we can compare two loop_vec_infos.  */
> -  loop_vinfo->vec_inside_cost = vec_inside_cost;
> -  loop_vinfo->vec_outside_cost = vec_outside_cost;
> -
>    if (dump_enabled_p ())
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 1cd6cc036f2..87d3f211a2e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -724,13 +724,6 @@ public:
>    /* Cost of a single scalar iteration.  */
>    int single_scalar_iteration_cost;
>
> -  /* The cost of the vector prologue and epilogue, including peeled
> -     iterations and set-up code.  */
> -  int vec_outside_cost;
> -
> -  /* The cost of the vector loop body.  */
> -  int vec_inside_cost;
> -
>    /* The factor used to over weight those statements in an inner loop
>       relative to the loop being vectorized.  */
>    unsigned int inner_loop_cost_factor;
> @@ -1429,6 +1422,7 @@ public:
>    unsigned int prologue_cost () const;
>    unsigned int body_cost () const;
>    unsigned int epilogue_cost () const;
> +  unsigned int outside_cost () const;
>
>  protected:
>    unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
> @@ -1489,6 +1483,14 @@ vector_costs::epilogue_cost () const
>    return m_costs[vect_epilogue];
>  }
>
> +/* Return the cost of the prologue and epilogue code (in abstract units).  */
> +
> +inline unsigned int
> +vector_costs::outside_cost () const
> +{
> +  return prologue_cost () + epilogue_cost ();
> +}
> +
>  #define VECT_MAX_COST 1000
>
>  /* The maximum number of intermediate steps required in multi-step type
> --
> 2.25.1
>
Martin Liška Nov. 10, 2021, 4:47 p.m. UTC | #2
On 11/8/21 11:43, Richard Sandiford via Gcc-patches wrote:
> |Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?|

I think the patch causes the following on x86_64-linux-gnu:
FAIL: gfortran.dg/inline_matmul_17.f90   -O   scan-tree-dump-times optimized "matmul_r4" 2

Martin
Richard Sandiford Nov. 10, 2021, 5:18 p.m. UTC | #3
Martin Liška <mliska@suse.cz> writes:
> On 11/8/21 11:43, Richard Sandiford via Gcc-patches wrote:
>> |Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?|
>
> I think the patch causes the following on x86_64-linux-gnu:
> FAIL: gfortran.dg/inline_matmul_17.f90   -O   scan-tree-dump-times optimized "matmul_r4" 2

I get that failure even with d70ef65692f (from before the patches
I committed today).

Thanks,
Richard
Martin Liška Nov. 11, 2021, 8:45 a.m. UTC | #4
On 11/10/21 18:18, Richard Sandiford wrote:
> Martin Liška <mliska@suse.cz> writes:
>> On 11/8/21 11:43, Richard Sandiford via Gcc-patches wrote:
>>> |Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?|
>>
>> I think the patch causes the following on x86_64-linux-gnu:
>> FAIL: gfortran.dg/inline_matmul_17.f90   -O   scan-tree-dump-times optimized "matmul_r4" 2
> 
> I get that failure even with d70ef65692f (from before the patches
> I committed today).

Sorry, you are right, it's one revision before:
d70ef65692fced7ab72e0aceeff7407e5a34d96d

Honza, can you please take a look?

Cheers,
Martin

> 
> Thanks,
> Richard
>
Jan Hubicka Nov. 11, 2021, 9:44 a.m. UTC | #5
> > > 
> > > I think the patch causes the following on x86_64-linux-gnu:
> > > FAIL: gfortran.dg/inline_matmul_17.f90   -O   scan-tree-dump-times optimized "matmul_r4" 2
> > 
> > I get that failure even with d70ef65692f (from before the patches
> > I committed today).
> 
> Sorry, you are right, it's one revision before:
> d70ef65692fced7ab72e0aceeff7407e5a34d96d
> 
> Honza, can you please take a look?
The test looks for matmul_r4 calls which we now optimize out in fre1.
This is because alias info is better now

  afunc (&__var_5_mma);                                                         
  _188 = __var_5_mma.dim[0].ubound;                                             
  _189 = __var_5_mma.dim[0].lbound;                                             
  _190 = _188 - _189;                                                           
  _191 = _190 + 1;                                                              
  _192 = MAX_EXPR <_191, 0>;                                                    
  _193 = (real(kind=4)) _192;                                                   
  _194 = __var_5_mma.dim[1].ubound;                                             
  _195 = __var_5_mma.dim[1].lbound;                                             
  _196 = _194 - _195;                                                           
  _197 = _196 + 1;                                                              
  _198 = MAX_EXPR <_197, 0>;                                                    
  _199 = (real(kind=4)) _198;                                                   
  _200 = _193 * _199;                                                           
  _201 = _200 * 3.0e+0;                                                         
  if (_201 <= 1.0e+9)                                                           
    goto <bb 76>; [INV]                                                         
  else                                                                          
    goto <bb 87>; [INV]                                                         
  <bb 76> :                                                                     
  c = {};                                                                       


  afunc (&__var_5_mma);                                                         
  c = {};                                                                       

Now afunc writes to __var_5_mma only indirectly so I think it is correct that
we optimize the conditional out.

Easy fix would be to add -fno-ipa-modref, but perhaps someone with
better understanding of Fortran would help me to improve the testcase so
the calls to matmul_r4 remains reachable?

Honza
Richard Biener Nov. 11, 2021, 10:09 a.m. UTC | #6
On Thu, Nov 11, 2021 at 10:45 AM Jan Hubicka via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> > > >
> > > > I think the patch causes the following on x86_64-linux-gnu:
> > > > FAIL: gfortran.dg/inline_matmul_17.f90   -O   scan-tree-dump-times optimized "matmul_r4" 2
> > >
> > > I get that failure even with d70ef65692f (from before the patches
> > > I committed today).
> >
> > Sorry, you are right, it's one revision before:
> > d70ef65692fced7ab72e0aceeff7407e5a34d96d
> >
> > Honza, can you please take a look?
> The test looks for matmul_r4 calls which we now optimize out in fre1.
> This is because alias info is better now
>
>   afunc (&__var_5_mma);
>   _188 = __var_5_mma.dim[0].ubound;
>   _189 = __var_5_mma.dim[0].lbound;
>   _190 = _188 - _189;
>   _191 = _190 + 1;
>   _192 = MAX_EXPR <_191, 0>;
>   _193 = (real(kind=4)) _192;
>   _194 = __var_5_mma.dim[1].ubound;
>   _195 = __var_5_mma.dim[1].lbound;
>   _196 = _194 - _195;
>   _197 = _196 + 1;
>   _198 = MAX_EXPR <_197, 0>;
>   _199 = (real(kind=4)) _198;
>   _200 = _193 * _199;
>   _201 = _200 * 3.0e+0;
>   if (_201 <= 1.0e+9)
>     goto <bb 76>; [INV]
>   else
>     goto <bb 87>; [INV]
>   <bb 76> :
>   c = {};
>
>
>   afunc (&__var_5_mma);
>   c = {};
>
> Now afunc writes to __var_5_mma only indirectly so I think it is correct that
> we optimize the conditional out.
>
> Easy fix would be to add -fno-ipa-modref, but perhaps someone with
> better understanding of Fortran would help me to improve the testcase so
> the calls to matmul_r4 remains reachable?

I think the two matmul_r4 cases were missed optimizations before so just
changing the expected number of calls to zero is the correct fix here.  Indeed
we can now statically determine the matrices are not large and so only
keep the inline copy.

Richard.

>
> Honza
Jan Hubicka Nov. 11, 2021, 5:54 p.m. UTC | #7
> > Now afunc writes to __var_5_mma only indirectly so I think it is correct that
> > we optimize the conditional out.
> >
> > Easy fix would be to add -fno-ipa-modref, but perhaps someone with
> > better understanding of Fortran would help me to improve the testcase so
> > the calls to matmul_r4 remains reachable?
> 
> I think the two matmul_r4 cases were missed optimizations before so just
> changing the expected number of calls to zero is the correct fix here.  Indeed
> we can now statically determine the matrices are not large and so only
> keep the inline copy.

I have updated the matmul as follows.

gcc/testsuite/ChangeLog:

2021-11-11  Jan Hubicka  <hubicka@ucw.cz>

	* gfortran.dg/inline_matmul_17.f90: Fix template

diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_17.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_17.f90
index d2ca8e2948a..cff4b6ce5e2 100644
--- a/gcc/testsuite/gfortran.dg/inline_matmul_17.f90
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_17.f90
@@ -45,4 +45,4 @@ program main
   c = matmul(a, bfunc())
   if (any(c-d /= 0)) STOP 6
 end program main
-! { dg-final { scan-tree-dump-times "matmul_r4" 2 "optimized" } }
+! { dg-final { scan-tree-dump-not "matmul_r4" "optimized" } }
diff mbox series

Patch

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b6a631d4384..dd4a363fee5 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -840,8 +840,6 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     scan_map (NULL),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
-    vec_outside_cost (0),
-    vec_inside_cost (0),
     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
     vectorizable (false),
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
@@ -2845,10 +2843,10 @@  vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
       /* Compute the costs by multiplying the inside costs with the factor and
 	 add the outside costs for a more complete picture.  The factor is the
 	 amount of times we are expecting to iterate this epilogue.  */
-      old_cost = old_loop_vinfo->vec_inside_cost * old_factor;
-      new_cost = new_loop_vinfo->vec_inside_cost * new_factor;
-      old_cost += old_loop_vinfo->vec_outside_cost;
-      new_cost += new_loop_vinfo->vec_outside_cost;
+      old_cost = old_loop_vinfo->vector_costs->body_cost () * old_factor;
+      new_cost = new_loop_vinfo->vector_costs->body_cost () * new_factor;
+      old_cost += old_loop_vinfo->vector_costs->outside_cost ();
+      new_cost += new_loop_vinfo->vector_costs->outside_cost ();
       return new_cost < old_cost;
     }
 
@@ -2865,8 +2863,8 @@  vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 
   /* Check whether the (fractional) cost per scalar iteration is lower
      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
-  poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
-  poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
+  poly_int64 rel_new = new_loop_vinfo->vector_costs->body_cost () * old_vf;
+  poly_int64 rel_old = old_loop_vinfo->vector_costs->body_cost () * new_vf;
 
   HOST_WIDE_INT est_rel_new_min
     = estimated_poly_value (rel_new, POLY_VALUE_MIN);
@@ -2918,8 +2916,10 @@  vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 
   /* If there's nothing to choose between the loop bodies, see whether
      there's a difference in the prologue and epilogue costs.  */
-  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
-    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
+  auto old_outside_cost = old_loop_vinfo->vector_costs->outside_cost ();
+  auto new_outside_cost = new_loop_vinfo->vector_costs->outside_cost ();
+  if (new_outside_cost != old_outside_cost)
+    return new_outside_cost < old_outside_cost;
 
   return false;
 }
@@ -4272,10 +4272,6 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
 
-  /* Stash the costs so that we can compare two loop_vec_infos.  */
-  loop_vinfo->vec_inside_cost = vec_inside_cost;
-  loop_vinfo->vec_outside_cost = vec_outside_cost;
-
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1cd6cc036f2..87d3f211a2e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -724,13 +724,6 @@  public:
   /* Cost of a single scalar iteration.  */
   int single_scalar_iteration_cost;
 
-  /* The cost of the vector prologue and epilogue, including peeled
-     iterations and set-up code.  */
-  int vec_outside_cost;
-
-  /* The cost of the vector loop body.  */
-  int vec_inside_cost;
-
   /* The factor used to over weight those statements in an inner loop
      relative to the loop being vectorized.  */
   unsigned int inner_loop_cost_factor;
@@ -1429,6 +1422,7 @@  public:
   unsigned int prologue_cost () const;
   unsigned int body_cost () const;
   unsigned int epilogue_cost () const;
+  unsigned int outside_cost () const;
 
 protected:
   unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@@ -1489,6 +1483,14 @@  vector_costs::epilogue_cost () const
   return m_costs[vect_epilogue];
 }
 
+/* Return the cost of the prologue and epilogue code (in abstract units).  */
+
+inline unsigned int
+vector_costs::outside_cost () const
+{
+  return prologue_cost () + epilogue_cost ();
+}
+
 #define VECT_MAX_COST 1000
 
 /* The maximum number of intermediate steps required in multi-step type