vect: Allow reduc_index != 1 for COND_OPs.

Message ID 3dc56d83-69f9-48db-a062-4e399d9c419c@gmail.com
State New
Headers
Series vect: Allow reduc_index != 1 for COND_OPs. |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm fail Patch failed to apply

Commit Message

Robin Dapp Nov. 21, 2023, 2:44 p.m. UTC
  Hi,

in PR112406 Tamar found another problem with COND_OP reductions.
I wrongly assumed that the reduction variable will always remain in
operand 1, just as we create the COND_OP in ifcvt.  But of course,
addition being commutative, we are free to swap operand 1 and 2 and
can end up with e.g.

 _ifc__60 = .COND_ADD (_2, _6, MADPictureC1_lsm.10_25, MADPictureC1_lsm.10_25);

which does not pass the asserts I put in place.

This patch removes this restriction and allows the reduction index to be
2 as well.

Bootstrapped and regtested on aarch64 and regtested on riscv.  x86 is
still running.

Regards
 Robin

gcc/ChangeLog:

	PR middle-end/112406

	* tree-vect-loop.cc (vectorize_fold_left_reduction): Allow
	reduction index != 1.
	(vect_transform_reduction): Handle reduction index != 1.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr112406-2.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/pr112406-2.c | 20 +++++++++++++++++++
 gcc/tree-vect-loop.cc                         | 15 +++++++++-----
 2 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr112406-2.c
  

Comments

Richard Biener Nov. 21, 2023, 2:49 p.m. UTC | #1
On Tue, 21 Nov 2023, Robin Dapp wrote:

> Hi,
> 
> in PR112406 Tamar found another problem with COND_OP reductions.
> I wrongly assumed that the reduction variable will always remain in
> operand 1, just as we create the COND_OP in ifcvt.  But of course,
> addition being commutative, we are free to swap operand 1 and 2 and
> can end up with e.g.
> 
>  _ifc__60 = .COND_ADD (_2, _6, MADPictureC1_lsm.10_25, MADPictureC1_lsm.10_25);
> 
> which does not pass the asserts I put in place.
> 
> This patch removes this restriction and allows the reduction index to be
> 2 as well.
> 
> Bootstrapped and regtested on aarch64 and regtested on riscv.  x86 is
> still running.

LGTM.

Thanks,
Richard.

> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/112406
> 
> 	* tree-vect-loop.cc (vectorize_fold_left_reduction): Allow
> 	reduction index != 1.
> 	(vect_transform_reduction): Handle reduction index != 1.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/pr112406-2.c: New test.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr112406-2.c | 20 +++++++++++++++++++
>  gcc/tree-vect-loop.cc                         | 15 +++++++++-----
>  2 files changed, 30 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr112406-2.c
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr112406-2.c b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
> new file mode 100644
> index 00000000000..bb6e9cf7c70
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-march=armv8-a+sve -Ofast" } */
> +
> +double MADPictureC1;
> +extern int PictureRejected[];
> +int PictureMAD_0, MADModelEstimator_n_windowSize_i, MADModelEstimator_n_windowSize_oneSampleQ;
> +
> +void MADModelEstimator_n_windowSize() {
> +  int estimateX2 = 0;
> +  for (; MADModelEstimator_n_windowSize_i; MADModelEstimator_n_windowSize_i++) {
> +    if (MADModelEstimator_n_windowSize_oneSampleQ &&
> +        !PictureRejected[MADModelEstimator_n_windowSize_i])
> +      estimateX2 = 1;
> +    if (!PictureRejected[MADModelEstimator_n_windowSize_i])
> +      MADPictureC1 += PictureMAD_0;
> +  }
> +  if (estimateX2)
> +    for (;;)
> +      ;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 58679e91c0a..044eacddf7e 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7070,7 +7070,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>      op0 = ops[1 - reduc_index];
>    else
>      {
> -      op0 = ops[2];
> +      op0 = ops[2 + (1 - reduc_index)];
>        opmask = ops[0];
>        gcc_assert (!slp_node);
>      }
> @@ -8455,7 +8455,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
>  		  || code == IFN_COND_MUL || code == IFN_COND_AND
>  		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> -      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +      gcc_assert (op.num_ops == 4
> +		  && (op.ops[reduc_index]
> +		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
>      }
>  
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> @@ -8498,12 +8500,15 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>      {
>        /* For a conditional operation pass the truth type as mask
>  	 vectype.  */
> -      gcc_assert (single_defuse_cycle && reduc_index == 1);
> +      gcc_assert (single_defuse_cycle
> +		  && (reduc_index == 1 || reduc_index == 2));
>        vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  			 op.ops[0], &vec_oprnds0,
>  			 truth_type_for (vectype_in),
> -			 NULL_TREE, &vec_oprnds1, NULL_TREE,
> -			 op.ops[2], &vec_oprnds2, NULL_TREE);
> +			 reduc_index == 1 ? NULL_TREE : op.ops[1],
> +			 &vec_oprnds1, NULL_TREE,
> +			 reduc_index == 2 ? NULL_TREE : op.ops[2],
> +			 &vec_oprnds2, NULL_TREE);
>      }
>  
>    /* For single def-use cycles get one copy of the vectorized reduction
>
  
Robin Dapp Nov. 21, 2023, 7:47 p.m. UTC | #2
>> Bootstrapped and regtested on aarch64 and regtested on riscv.  x86 is
>> still running.

Just to confirm:  x86 bootstrap and regtest unchanged.  Going to commit
it soon.

Regards
 Robin
  

Patch

diff --git a/gcc/testsuite/gcc.target/aarch64/pr112406-2.c b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
new file mode 100644
index 00000000000..bb6e9cf7c70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-march=armv8-a+sve -Ofast" } */
+
+double MADPictureC1;
+extern int PictureRejected[];
+int PictureMAD_0, MADModelEstimator_n_windowSize_i, MADModelEstimator_n_windowSize_oneSampleQ;
+
+void MADModelEstimator_n_windowSize() {
+  int estimateX2 = 0;
+  for (; MADModelEstimator_n_windowSize_i; MADModelEstimator_n_windowSize_i++) {
+    if (MADModelEstimator_n_windowSize_oneSampleQ &&
+        !PictureRejected[MADModelEstimator_n_windowSize_i])
+      estimateX2 = 1;
+    if (!PictureRejected[MADModelEstimator_n_windowSize_i])
+      MADPictureC1 += PictureMAD_0;
+  }
+  if (estimateX2)
+    for (;;)
+      ;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 58679e91c0a..044eacddf7e 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7070,7 +7070,7 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
     op0 = ops[1 - reduc_index];
   else
     {
-      op0 = ops[2];
+      op0 = ops[2 + (1 - reduc_index)];
       opmask = ops[0];
       gcc_assert (!slp_node);
     }
@@ -8455,7 +8455,9 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
 		  || code == IFN_COND_MUL || code == IFN_COND_AND
 		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
-      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+      gcc_assert (op.num_ops == 4
+		  && (op.ops[reduc_index]
+		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
     }
 
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
@@ -8498,12 +8500,15 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
     {
       /* For a conditional operation pass the truth type as mask
 	 vectype.  */
-      gcc_assert (single_defuse_cycle && reduc_index == 1);
+      gcc_assert (single_defuse_cycle
+		  && (reduc_index == 1 || reduc_index == 2));
       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 			 op.ops[0], &vec_oprnds0,
 			 truth_type_for (vectype_in),
-			 NULL_TREE, &vec_oprnds1, NULL_TREE,
-			 op.ops[2], &vec_oprnds2, NULL_TREE);
+			 reduc_index == 1 ? NULL_TREE : op.ops[1],
+			 &vec_oprnds1, NULL_TREE,
+			 reduc_index == 2 ? NULL_TREE : op.ops[2],
+			 &vec_oprnds2, NULL_TREE);
     }
 
   /* For single def-use cycles get one copy of the vectorized reduction