vect: Allow reduc_index != 1 for COND_OPs.
Checks
Commit Message
Hi,
in PR112406 Tamar found another problem with COND_OP reductions.
I wrongly assumed that the reduction variable will always remain in
operand 1, just as we create the COND_OP in ifcvt. But of course,
addition being commutative, we are free to swap operand 1 and 2 and
can end up with e.g.
_ifc__60 = .COND_ADD (_2, _6, MADPictureC1_lsm.10_25, MADPictureC1_lsm.10_25);
which does not pass the asserts I put in place.
This patch removes this restriction and allows the reduction index to be
2 as well.
Bootstrapped and regtested on aarch64 and regtested on riscv. x86 is
still running.
Regards
Robin
gcc/ChangeLog:
PR middle-end/112406
* tree-vect-loop.cc (vectorize_fold_left_reduction): Allow
reduction index != 1.
(vect_transform_reduction): Handle reduction index != 1.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/pr112406-2.c: New test.
---
gcc/testsuite/gcc.target/aarch64/pr112406-2.c | 20 +++++++++++++++++++
gcc/tree-vect-loop.cc | 15 +++++++++-----
2 files changed, 30 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/pr112406-2.c
Comments
On Tue, 21 Nov 2023, Robin Dapp wrote:
> Hi,
>
> in PR112406 Tamar found another problem with COND_OP reductions.
> I wrongly assumed that the reduction variable will always remain in
> operand 1, just as we create the COND_OP in ifcvt. But of course,
> addition being commutative, we are free to swap operand 1 and 2 and
> can end up with e.g.
>
> _ifc__60 = .COND_ADD (_2, _6, MADPictureC1_lsm.10_25, MADPictureC1_lsm.10_25);
>
> which does not pass the asserts I put in place.
>
> This patch removes this restriction and allows the reduction index to be
> 2 as well.
>
> Bootstrapped and regtested on aarch64 and regtested on riscv. x86 is
> still running.
LGTM.
Thanks,
Richard.
> Regards
> Robin
>
> gcc/ChangeLog:
>
> PR middle-end/112406
>
> * tree-vect-loop.cc (vectorize_fold_left_reduction): Allow
> reduction index != 1.
> (vect_transform_reduction): Handle reduction index != 1.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/pr112406-2.c: New test.
> ---
> gcc/testsuite/gcc.target/aarch64/pr112406-2.c | 20 +++++++++++++++++++
> gcc/tree-vect-loop.cc | 15 +++++++++-----
> 2 files changed, 30 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr112406-2.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr112406-2.c b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
> new file mode 100644
> index 00000000000..bb6e9cf7c70
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr112406-2.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-march=armv8-a+sve -Ofast" } */
> +
> +double MADPictureC1;
> +extern int PictureRejected[];
> +int PictureMAD_0, MADModelEstimator_n_windowSize_i, MADModelEstimator_n_windowSize_oneSampleQ;
> +
> +void MADModelEstimator_n_windowSize() {
> + int estimateX2 = 0;
> + for (; MADModelEstimator_n_windowSize_i; MADModelEstimator_n_windowSize_i++) {
> + if (MADModelEstimator_n_windowSize_oneSampleQ &&
> + !PictureRejected[MADModelEstimator_n_windowSize_i])
> + estimateX2 = 1;
> + if (!PictureRejected[MADModelEstimator_n_windowSize_i])
> + MADPictureC1 += PictureMAD_0;
> + }
> + if (estimateX2)
> + for (;;)
> + ;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 58679e91c0a..044eacddf7e 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7070,7 +7070,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> op0 = ops[1 - reduc_index];
> else
> {
> - op0 = ops[2];
> + op0 = ops[2 + (1 - reduc_index)];
> opmask = ops[0];
> gcc_assert (!slp_node);
> }
> @@ -8455,7 +8455,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> || code == IFN_COND_MUL || code == IFN_COND_AND
> || code == IFN_COND_IOR || code == IFN_COND_XOR);
> - gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> + gcc_assert (op.num_ops == 4
> + && (op.ops[reduc_index]
> + == op.ops[internal_fn_else_index ((internal_fn) code)]));
> }
>
> bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> @@ -8498,12 +8500,15 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> {
> /* For a conditional operation pass the truth type as mask
> vectype. */
> - gcc_assert (single_defuse_cycle && reduc_index == 1);
> + gcc_assert (single_defuse_cycle
> + && (reduc_index == 1 || reduc_index == 2));
> vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> op.ops[0], &vec_oprnds0,
> truth_type_for (vectype_in),
> - NULL_TREE, &vec_oprnds1, NULL_TREE,
> - op.ops[2], &vec_oprnds2, NULL_TREE);
> + reduc_index == 1 ? NULL_TREE : op.ops[1],
> + &vec_oprnds1, NULL_TREE,
> + reduc_index == 2 ? NULL_TREE : op.ops[2],
> + &vec_oprnds2, NULL_TREE);
> }
>
> /* For single def-use cycles get one copy of the vectorized reduction
>
>> Bootstrapped and regtested on aarch64 and regtested on riscv. x86 is
>> still running.
Just to confirm: x86 bootstrap and regtest unchanged. Going to commit
it soon.
Regards
Robin
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-march=armv8-a+sve -Ofast" } */
+
+double MADPictureC1;
+extern int PictureRejected[];
+int PictureMAD_0, MADModelEstimator_n_windowSize_i, MADModelEstimator_n_windowSize_oneSampleQ;
+
+void MADModelEstimator_n_windowSize() {
+ int estimateX2 = 0;
+ for (; MADModelEstimator_n_windowSize_i; MADModelEstimator_n_windowSize_i++) {
+ if (MADModelEstimator_n_windowSize_oneSampleQ &&
+ !PictureRejected[MADModelEstimator_n_windowSize_i])
+ estimateX2 = 1;
+ if (!PictureRejected[MADModelEstimator_n_windowSize_i])
+ MADPictureC1 += PictureMAD_0;
+ }
+ if (estimateX2)
+ for (;;)
+ ;
+}
@@ -7070,7 +7070,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
op0 = ops[1 - reduc_index];
else
{
- op0 = ops[2];
+ op0 = ops[2 + (1 - reduc_index)];
opmask = ops[0];
gcc_assert (!slp_node);
}
@@ -8455,7 +8455,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
|| code == IFN_COND_MUL || code == IFN_COND_AND
|| code == IFN_COND_IOR || code == IFN_COND_XOR);
- gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+ gcc_assert (op.num_ops == 4
+ && (op.ops[reduc_index]
+ == op.ops[internal_fn_else_index ((internal_fn) code)]));
}
bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
@@ -8498,12 +8500,15 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
{
/* For a conditional operation pass the truth type as mask
vectype. */
- gcc_assert (single_defuse_cycle && reduc_index == 1);
+ gcc_assert (single_defuse_cycle
+ && (reduc_index == 1 || reduc_index == 2));
vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
op.ops[0], &vec_oprnds0,
truth_type_for (vectype_in),
- NULL_TREE, &vec_oprnds1, NULL_TREE,
- op.ops[2], &vec_oprnds2, NULL_TREE);
+ reduc_index == 1 ? NULL_TREE : op.ops[1],
+ &vec_oprnds1, NULL_TREE,
+ reduc_index == 2 ? NULL_TREE : op.ops[2],
+ &vec_oprnds2, NULL_TREE);
}
/* For single def-use cycles get one copy of the vectorized reduction