[8/8] fortran: Continue MINLOC/MAXLOC second loop where the first stopped [PR90608]

Message ID 20240731200735.229898-9-morin-mikael@orange.fr
State New
Headers
Series fortran: Inline MINLOC/MAXLOC without DIM argument [PR90608] |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed

Commit Message

Mikael Morin July 31, 2024, 8:07 p.m. UTC
  From: Mikael Morin <mikael@gcc.gnu.org>

Regression-tested on x86_64-pc-linux-gnu.
OK for master?

-- >8 --

Continue the second set of loops where the first one stopped in the
generated inline MINLOC/MAXLOC code in the cases where the generated code
contains two sets of loops.  This fixes a regression that was introduced
when enabling the generation of inline MINLOC/MAXLOC code with ARRAY of rank
greater than 1, no DIM argument, and either non-scalar MASK or floating-
point ARRAY.

In the cases where two sets of loops are generated as inline MINLOC/MAXLOC
code, we previously generated code such as (for rank 2 ARRAY, so with two
levels of nesting):

	for (idx11 in lower1..upper1)
	  {
	    for (idx12 in lower2..upper2)
	      {
	        ...
	        if (...)
	          {
	            ...
	            goto second_loop;
	          }
	      }
	  }
	second_loop:
	for (idx21 in lower1..upper1)
	  {
	    for (idx22 in lower2..upper2)
	      {
	        ...
	      }
	  }

which means we process the first elements twice, once in the first set
of loops and once in the second one.  This change avoids this duplicate
processing by using a conditional as lower bound for the second set of
loops, generating code like:

	second_loop_entry = false;
	for (idx11 in lower1..upper1)
	  {
	    for (idx12 in lower2..upper2)
	      {
	        ...
	        if (...)
	          {
	            ...
	            second_loop_entry = true;
	            goto second_loop;
	          }
	      }
	  }
	second_loop:
	for (idx21 in (second_loop_entry ? idx11 : lower1)..upper1)
	  {
	    for (idx22 in (second_loop_entry ? idx12 : lower2)..upper2)
	      {
	        ...
	        second_loop_entry = false;
	      }
	  }

It was expected that the compiler optimizations would be able to remove the
state variable second_loop_entry.  It is the case if ARRAY has rank 1 (so
without loop nesting), the variable is removed and the loop bounds become
unconditional, which restores previously generated code, fully fixing the
regression.  For larger rank, unfortunately, the state variable and
conditional loop bounds remain, but those cases were previously using
library calls, so it's not a regression.

	PR fortran/90608

gcc/fortran/ChangeLog:

	* trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Generate a set
	of index variables.  Set them using the loop indexes before leaving
	the first set of loops.  Generate a new loop entry predicate.
	Initialize it.  Set it before leaving the first set of loops.  Clear
	it in the body of the second set of loops.  For the second set of
	loops, update each loop lower bound to use the corresponding index
	variable if the predicate variable is set.
---
 gcc/fortran/trans-intrinsic.cc | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)
  

Patch

diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc
index 3a6a73d4241..89134b1190b 100644
--- a/gcc/fortran/trans-intrinsic.cc
+++ b/gcc/fortran/trans-intrinsic.cc
@@ -5342,6 +5342,7 @@  strip_kind_from_actual (gfc_actual_arglist * actual)
 	 pos0 = 0;
 	 pos1 = 0;
 	 S1 = from1;
+	 second_loop_entry = false;
 	 while (S1 <= to1) {
 	   S0 = from0;
 	   while (s0 <= to0 {
@@ -5354,6 +5355,7 @@  strip_kind_from_actual (gfc_actual_arglist * actual)
 	         limit = a[S1][S0];
 	         pos0 = S0 + (1 - from0);
 	         pos1 = S1 + (1 - from1);
+	         second_loop_entry = true;
 	         goto lab1;
 	       }
 	     }
@@ -5363,9 +5365,9 @@  strip_kind_from_actual (gfc_actual_arglist * actual)
 	 }
 	 goto lab2;
 	 lab1:;
-	 S1 = from1;
+	 S1 = second_loop_entry ? S1 : from1;
 	 while (S1 <= to1) {
-	   S0 = from0;
+	   S0 = second_loop_entry ? S0 : from0;
 	   while (S0 <= to0) {
 	     if (mask[S1][S0])
 	       if (a[S1][S0] < limit) {
@@ -5373,6 +5375,7 @@  strip_kind_from_actual (gfc_actual_arglist * actual)
 	         pos0 = S + (1 - from0);
 	         pos1 = S + (1 - from1);
 	       }
+	     second_loop_entry = false;
 	     S0++;
 	   }
 	   S1++;
@@ -5444,6 +5447,7 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
   gfc_expr *backexpr;
   gfc_se backse;
   tree pos[GFC_MAX_DIMENSIONS];
+  tree idx[GFC_MAX_DIMENSIONS];
   tree result_var = NULL_TREE;
   int n;
   bool optional_mask;
@@ -5525,6 +5529,8 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
 			       gfc_get_string ("pos%d", i));
       offset[i] = gfc_create_var (gfc_array_index_type,
 				  gfc_get_string ("offset%d", i));
+      idx[i] = gfc_create_var (gfc_array_index_type,
+			       gfc_get_string ("idx%d", i));
     }
 
   /* Walk the arguments.  */
@@ -5609,6 +5615,18 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
 
   gfc_add_modify (&se->pre, limit, tmp);
 
+  /* If we are in a case where we generate two sets of loops, the second one
+     should continue where the first stopped instead of restarting from the
+     beginning.  So nested loops in the second set should have a partial range
+     on the first iteration, but they should start from the beginning and span
+     their full range on the following iterations.  So we use conditionals in
+     the loops lower bounds, and use the following variable in those
+     conditionals to decide whether to use the original loop bound or to use
+     the index at which the loop from the first set stopped.  */
+  tree second_loop_entry = gfc_create_var (logical_type_node,
+					   "second_loop_entry");
+  gfc_add_modify (&se->pre, second_loop_entry, logical_false_node);
+
   /* Initialize the scalarizer.  */
   gfc_init_loopinfo (&loop);
 
@@ -5752,8 +5770,11 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
       tmp = fold_build2_loc (input_location, PLUS_EXPR, TREE_TYPE (pos[i]),
 			     loop.loopvar[i], offset[i]);
       gfc_add_modify (&ifblock, pos[i], tmp);
+      gfc_add_modify (&ifblock, idx[i], loop.loopvar[i]);
     }
 
+  gfc_add_modify (&ifblock, second_loop_entry, logical_true_node);
+
   if (lab1)
     gfc_add_expr_to_block (&ifblock, build1_v (GOTO_EXPR, lab1));
 
@@ -5816,6 +5837,12 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
 
   if (lab1)
     {
+      for (int i = 0; i < loop.dimen; i++)
+	loop.from[i] = fold_build3_loc (input_location, COND_EXPR,
+					TREE_TYPE (loop.from[i]),
+					second_loop_entry, idx[i],
+					loop.from[i]);
+
       gfc_trans_scalarized_loop_boundary (&loop, &body);
 
       stmtblock_t * const outer_block = &loop.code[loop.dimen - 1];
@@ -5921,7 +5948,9 @@  gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op)
 	}
       else
 	tmp = gfc_finish_block (&block);
+
       gfc_add_expr_to_block (&body, tmp);
+      gfc_add_modify (&body, second_loop_entry, logical_false_node);
     }
 
   gfc_trans_scalarizing_loops (&loop, &body);