vect: Fix mask handling for SLP gathers [PR103761]
Commit Message
check_load_store_for_partial_vectors predates the support for SLP
gathers and so had a hard-coded assumption that gathers/scatters
(and load/store lanes) would be non-SLP operations. This patch
passes down the slp_node so that the routine can work out how
many vectors are needed in both the SLP and non-SLP cases.
Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Richard
gcc/
PR tree-optimization/103761
* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Replace
the ncopies parameter with an slp_node parameter. Calculate the
number of vectors based on it and vectype. Rename lambda to
group_memory_nvectors.
(vectorizable_store, vectorizable_load): Update calls accordingly.
gcc/testsuite/
PR tree-optimization/103761
* gcc.dg/vect/pr103761.c: New test.
* gcc.target/aarch64/sve/pr103761.c: Likewise.
---
gcc/testsuite/gcc.dg/vect/pr103761.c | 13 +++++++
.../gcc.target/aarch64/sve/pr103761.c | 13 +++++++
gcc/tree-vect-stmts.cc | 37 ++++++++++++-------
3 files changed, 50 insertions(+), 13 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/pr103761.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
Comments
On Tue, 5 Apr 2022, Richard Sandiford wrote:
> check_load_store_for_partial_vectors predates the support for SLP
> gathers and so had a hard-coded assumption that gathers/scatters
> (and load/stores lanes) would be non-SLP operations. This patch
> passes down the slp_node so that the routine can work out how
> many vectors are needed in both the SLP and non-SLP cases.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
OK.
Richard.
> Richard
>
>
> gcc/
> PR tree-optimization/103761
> * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Replace
> the ncopies parameter with an slp_node parameter. Calculate the
> number of vectors based on it and vectype. Rename lambda to
> group_memory_nvectors.
> (vectorizable_store, vectorizable_load): Update calls accordingly.
>
> gcc/testsuite/
> PR tree-optimization/103761
> * gcc.dg/vect/pr103761.c: New test.
> * gcc.target/aarch64/sve/pr103761.c: Likewise.
> ---
> gcc/testsuite/gcc.dg/vect/pr103761.c | 13 +++++++
> .../gcc.target/aarch64/sve/pr103761.c | 13 +++++++
> gcc/tree-vect-stmts.cc | 37 ++++++++++++-------
> 3 files changed, 50 insertions(+), 13 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/pr103761.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr103761.c b/gcc/testsuite/gcc.dg/vect/pr103761.c
> new file mode 100644
> index 00000000000..0982a63eb6a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr103761.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +
> +void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
> +{
> + for (int i = 0; i < 100; i += 4)
> + {
> + x[i] = (long) y[z[i]] + 1;
> + x[i + 1] = (long) y[z[i + 1]] + 2;
> + x[i + 2] = (long) y[z[i + 2]] + 3;
> + x[i + 3] = (long) y[z[i + 3]] + 4;
> + a[i] += 1;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
> new file mode 100644
> index 00000000000..001b4d407ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
> @@ -0,0 +1,13 @@
> +/* { dg-options "-O3" } */
> +
> +void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
> +{
> + for (int i = 0; i < 100; i += 4)
> + {
> + x[i] = (long) y[z[i]] + 1;
> + x[i + 1] = (long) y[z[i + 1]] + 2;
> + x[i + 2] = (long) y[z[i + 2]] + 3;
> + x[i + 3] = (long) y[z[i + 3]] + 4;
> + a[i] += 1;
> + }
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index f6fc7e1fcdd..c0107c8c489 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1690,7 +1690,8 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
> as well as whether the target does.
>
> VLS_TYPE says whether the statement is a load or store and VECTYPE
> - is the type of the vector being loaded or stored. MEMORY_ACCESS_TYPE
> + is the type of the vector being loaded or stored. SLP_NODE is the SLP
> + node that contains the statement, or null if none. MEMORY_ACCESS_TYPE
> says how the load or store is going to be implemented and GROUP_SIZE
> is the number of load or store statements in the containing group.
> If the access is a gather load or scatter store, GS_INFO describes
> @@ -1703,11 +1704,11 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
>
> static void
> check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> + slp_tree slp_node,
> vec_load_store_type vls_type,
> int group_size,
> vect_memory_access_type
> memory_access_type,
> - unsigned int ncopies,
> gather_scatter_info *gs_info,
> tree scalar_mask)
> {
> @@ -1715,6 +1716,12 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> if (memory_access_type == VMAT_INVARIANT)
> return;
>
> + unsigned int nvectors;
> + if (slp_node)
> + nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> + else
> + nvectors = vect_get_num_copies (loop_vinfo, vectype);
> +
> vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> machine_mode vecmode = TYPE_MODE (vectype);
> bool is_load = (vls_type == VLS_LOAD);
> @@ -1732,7 +1739,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> return;
> }
> - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
> + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> + scalar_mask);
> return;
> }
>
> @@ -1754,7 +1762,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> return;
> }
> - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
> + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> + scalar_mask);
> return;
> }
>
> @@ -1784,7 +1793,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> /* We might load more scalars than we need for permuting SLP loads.
> We checked in get_group_load_store_type that the extra elements
> don't leak into a new vector. */
> - auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits)
> + auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
> {
> unsigned int nvectors;
> if (can_div_away_from_zero_p (size, nunits, &nvectors))
> @@ -1799,7 +1808,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
> && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
> {
> - unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
> + nvectors = group_memory_nvectors (group_size * vf, nunits);
> vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
> using_partial_vectors_p = true;
> }
> @@ -1807,7 +1816,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> machine_mode vmode;
> if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
> {
> - unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
> + nvectors = group_memory_nvectors (group_size * vf, nunits);
> vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
> vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
> @@ -7571,9 +7580,10 @@ vectorizable_store (vec_info *vinfo,
>
> if (loop_vinfo
> && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> - check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type,
> - group_size, memory_access_type,
> - ncopies, &gs_info, mask);
> + check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
> + vls_type, group_size,
> + memory_access_type, &gs_info,
> + mask);
>
> if (slp_node
> && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
> @@ -8921,9 +8931,10 @@ vectorizable_load (vec_info *vinfo,
>
> if (loop_vinfo
> && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> - check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD,
> - group_size, memory_access_type,
> - ncopies, &gs_info, mask);
> + check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
> + VLS_LOAD, group_size,
> + memory_access_type, &gs_info,
> + mask);
>
> if (dump_enabled_p ()
> && memory_access_type != VMAT_ELEMENTWISE
>
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
+{
+ for (int i = 0; i < 100; i += 4)
+ {
+ x[i] = (long) y[z[i]] + 1;
+ x[i + 1] = (long) y[z[i + 1]] + 2;
+ x[i + 2] = (long) y[z[i + 2]] + 3;
+ x[i + 3] = (long) y[z[i + 3]] + 4;
+ a[i] += 1;
+ }
+}
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-options "-O3" } */
+
+void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
+{
+ for (int i = 0; i < 100; i += 4)
+ {
+ x[i] = (long) y[z[i]] + 1;
+ x[i + 1] = (long) y[z[i + 1]] + 2;
+ x[i + 2] = (long) y[z[i + 2]] + 3;
+ x[i + 3] = (long) y[z[i + 3]] + 4;
+ a[i] += 1;
+ }
+}
@@ -1690,7 +1690,8 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
as well as whether the target does.
VLS_TYPE says whether the statement is a load or store and VECTYPE
- is the type of the vector being loaded or stored. MEMORY_ACCESS_TYPE
+ is the type of the vector being loaded or stored. SLP_NODE is the SLP
+ node that contains the statement, or null if none. MEMORY_ACCESS_TYPE
says how the load or store is going to be implemented and GROUP_SIZE
is the number of load or store statements in the containing group.
If the access is a gather load or scatter store, GS_INFO describes
@@ -1703,11 +1704,11 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
static void
check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
+ slp_tree slp_node,
vec_load_store_type vls_type,
int group_size,
vect_memory_access_type
memory_access_type,
- unsigned int ncopies,
gather_scatter_info *gs_info,
tree scalar_mask)
{
@@ -1715,6 +1716,12 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
if (memory_access_type == VMAT_INVARIANT)
return;
+ unsigned int nvectors;
+ if (slp_node)
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ else
+ nvectors = vect_get_num_copies (loop_vinfo, vectype);
+
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
machine_mode vecmode = TYPE_MODE (vectype);
bool is_load = (vls_type == VLS_LOAD);
@@ -1732,7 +1739,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
return;
}
- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+ scalar_mask);
return;
}
@@ -1754,7 +1762,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
return;
}
- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+ scalar_mask);
return;
}
@@ -1784,7 +1793,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
/* We might load more scalars than we need for permuting SLP loads.
We checked in get_group_load_store_type that the extra elements
don't leak into a new vector. */
- auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits)
+ auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
{
unsigned int nvectors;
if (can_div_away_from_zero_p (size, nunits, &nvectors))
@@ -1799,7 +1808,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
&& can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
{
- unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+ nvectors = group_memory_nvectors (group_size * vf, nunits);
vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
using_partial_vectors_p = true;
}
@@ -1807,7 +1816,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
machine_mode vmode;
if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
{
- unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+ nvectors = group_memory_nvectors (group_size * vf, nunits);
vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
@@ -7571,9 +7580,10 @@ vectorizable_store (vec_info *vinfo,
if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
- check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type,
- group_size, memory_access_type,
- ncopies, &gs_info, mask);
+ check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
+ vls_type, group_size,
+ memory_access_type, &gs_info,
+ mask);
if (slp_node
&& !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
@@ -8921,9 +8931,10 @@ vectorizable_load (vec_info *vinfo,
if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
- check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD,
- group_size, memory_access_type,
- ncopies, &gs_info, mask);
+ check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
+ VLS_LOAD, group_size,
+ memory_access_type, &gs_info,
+ mask);
if (dump_enabled_p ()
&& memory_access_type != VMAT_ELEMENTWISE