middle-end: rework vectorizable_store to iterate over single index [PR117557]
Checks

Context                                        | Check   | Description
-----------------------------------------------+---------+-------------
linaro-tcwg-bot/tcwg_gcc_build--master-arm     | success | Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 | success | Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm     | success | Test passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 | success | Test passed
Commit Message
Hi All,
The testcase
#include <stdint.h>
#include <string.h>
#define N 8
#define L 8
void f(const uint8_t * restrict seq1,
const uint8_t *idx, uint8_t *seq_out) {
for (int i = 0; i < L; ++i) {
uint8_t h = idx[i];
memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
}
}
compiled at -O3 -mcpu=neoverse-n1+sve
miscompiles to:
ld1w z31.s, p3/z, [x23, z29.s, sxtw]
ld1w z29.s, p7/z, [x23, z30.s, sxtw]
st1w z29.s, p7, [x24, z12.s, sxtw]
st1w z31.s, p7, [x24, z12.s, sxtw]
rather than
ld1w z31.s, p3/z, [x23, z29.s, sxtw]
ld1w z29.s, p7/z, [x23, z30.s, sxtw]
st1w z29.s, p7, [x24, z12.s, sxtw]
addvl x3, x24, #2
st1w z31.s, p3, [x3, z12.s, sxtw]
Two things go wrong here: the wrong mask is used, and the address pointers for
the stores are wrong.
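For illustration only, here is a minimal runtime harness (my own, not part of
the patch or the testsuite; the index values 3*i+1 are arbitrary) that, when
compiled and linked together with the testcase above, checks f against a plain
scalar byte-copy loop and reports a miscompare when the wrong mask or address
is used:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define N 8
#define L 8

/* f is the testcase function above; compile both files together.  */
void f(const uint8_t * restrict seq1, const uint8_t *idx, uint8_t *seq_out);

int main(void) {
  uint8_t seq1[256], idx[L], out[L * N], ref[L * N];

  /* Deterministic but otherwise arbitrary input data.  */
  for (int i = 0; i < 256; ++i)
    seq1[i] = (uint8_t)i;
  for (int i = 0; i < L; ++i)
    idx[i] = (uint8_t)(3 * i + 1);

  memset(out, 0, sizeof out);
  memset(ref, 0, sizeof ref);

  /* Scalar reference: copy N/2 bytes per iteration by hand.  */
  for (int i = 0; i < L; ++i)
    for (int k = 0; k < N / 2; ++k)
      ref[i * N + k] = seq1[idx[i] * N / 2 + k];

  f(seq1, idx, out);

  if (memcmp(out, ref, sizeof out) != 0) {
    puts("miscompare");
    return 1;
  }
  puts("ok");
  return 0;
}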
This happens because the codegen loop in vectorizable_store is a nested loop:
the outer loop iterates over ncopies and the inner loop over vec_num.
For SLP, ncopies == 1 and vec_num == SLP_NUM_STMTS, but the loop mask is
determined only by the outer loop index, and the pointer address is only
updated in the outer loop.
As such, for SLP we always use the same predicate and the same memory location.
This patch flattens the two loops, iterating over ncopies * vec_num instead,
and simplifies the indexing.
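As a toy model (purely illustrative; the variable names mirror the patch, but
none of this is the actual vectorizable_store code), the difference in how the
mask and address are selected per store looks like this:

#include <stdio.h>

int main(void) {
  const int ncopies = 1, vec_num = 2;  /* the SLP case from the testcase */

  /* Old scheme: nested loops with mask and base address keyed on the outer
     index j only, so both stores end up with the same mask and address.  */
  printf("nested loops:\n");
  for (int j = 0; j < ncopies; ++j)
    for (int i = 0; i < vec_num; ++i)
      printf("  store %d: mask %d, address step %d\n", vec_num * j + i, j, j);

  /* New scheme: one flattened loop over ncopies * vec_num with mask and
     address keyed on the flattened index, so each store is distinct.  */
  printf("flattened loop:\n");
  int num_stmts = ncopies * vec_num;
  for (int j = 0; j < num_stmts; ++j)
    printf("  store %d: mask %d, address step %d\n", j, j, j);

  return 0;
}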
This does not fully fix the gcc_r miscompile in SPEC CPU 2017, as the error
moves somewhere else. I will look at that next, but this patch does fix some
other libraries that also started failing.
Bootstrapped and regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf,
and x86_64-pc-linux-gnu (-m32 and -m64) with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR tree-optimization/117557
* tree-vect-stmts.cc (vectorizable_store): Flatten the ncopies and
vec_num loops.
gcc/testsuite/ChangeLog:
PR tree-optimization/117557
* gcc.target/aarch64/pr117557.c: New test.
---
--
Comments
On Wed, 27 Nov 2024, Tamar Christina wrote:
> [...]
>
> Ok for master?
OK.
Thanks,
Richard.
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/117557
> * tree-vect-stmts.cc (vectorizable_store): Flatten the ncopies and
> vec_num loops.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/117557
> * gcc.target/aarch64/pr117557.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr117557.c b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80b3fde41109988db70eafd715224df0b0029cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mcpu=neoverse-n1+sve -fdump-tree-vect" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +#define N 8
> +#define L 8
> +
> +/*
> +**f:
> +** ...
> +** ld1w z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** ld1w z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** st1w z[0-9]+.s, p\1, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** incb x([0-9]+), all, mul #2
> +** st1w z[0-9]+.s, p\2, \[x\3, z[0-9]+.s, sxtw\]
> +** ret
> +** ...
> +*/
> +void f(const uint8_t * restrict seq1,
> + const uint8_t *idx, uint8_t *seq_out) {
> + for (int i = 0; i < L; ++i) {
> + uint8_t h = idx[i];
> + memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
> + }
> +}
> +
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c2d5818b2786123fac7afe290d85c7dd2bda4308..4759c274f3ccbb111a907576539b2a8efb7726a3 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9228,7 +9228,8 @@ vectorizable_store (vec_info *vinfo,
> gcc_assert (!grouped_store);
> auto_vec<tree> vec_offsets;
> unsigned int inside_cost = 0, prologue_cost = 0;
> - for (j = 0; j < ncopies; j++)
> + int num_stmts = ncopies * vec_num;
> + for (j = 0; j < num_stmts; j++)
> {
> gimple *new_stmt;
> if (j == 0)
> @@ -9246,14 +9247,14 @@ vectorizable_store (vec_info *vinfo,
> vect_get_slp_defs (op_node, gvec_oprnds[0]);
> else
> vect_get_vec_defs_for_operand (vinfo, first_stmt_info,
> - ncopies, op, gvec_oprnds[0]);
> + num_stmts, op, gvec_oprnds[0]);
> if (mask)
> {
> if (slp_node)
> vect_get_slp_defs (mask_node, &vec_masks);
> else
> vect_get_vec_defs_for_operand (vinfo, stmt_info,
> - ncopies,
> + num_stmts,
> mask, &vec_masks,
> mask_vectype);
> }
> @@ -9279,281 +9280,280 @@ vectorizable_store (vec_info *vinfo,
> }
>
> new_stmt = NULL;
> - for (i = 0; i < vec_num; ++i)
> + if (!costing_p)
> {
> - if (!costing_p)
> - {
> - vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
> - if (mask)
> - vec_mask = vec_masks[vec_num * j + i];
> - /* We should have catched mismatched types earlier. */
> - gcc_assert (useless_type_conversion_p (vectype,
> - TREE_TYPE (vec_oprnd)));
> - }
> - unsigned HOST_WIDE_INT align;
> - tree final_mask = NULL_TREE;
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> - if (!costing_p)
> + vec_oprnd = (*gvec_oprnds[0])[j];
> + if (mask)
> + vec_mask = vec_masks[j];
> + /* We should have catched mismatched types earlier. */
> + gcc_assert (useless_type_conversion_p (vectype,
> + TREE_TYPE (vec_oprnd)));
> + }
> + unsigned HOST_WIDE_INT align;
> + tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> + if (!costing_p)
> + {
> + if (loop_masks)
> + final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> + loop_masks, num_stmts,
> + vectype, j);
> + if (vec_mask)
> + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> + final_mask, vec_mask, gsi);
> + }
> +
> + if (gs_info.ifn != IFN_LAST)
> + {
> + if (costing_p)
> {
> - if (loop_masks)
> - final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> - loop_masks,
> - ncopies * vec_num,
> - vectype, j);
> - if (vec_mask)
> - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> - final_mask, vec_mask, gsi);
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0,
> + vect_body);
> + continue;
> }
>
> - if (gs_info.ifn != IFN_LAST)
> - {
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0,
> - vect_body);
> - continue;
> - }
> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + vec_offset = vec_offsets[j];
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> - vec_offset = vec_offsets[vec_num * j + i];
> - tree scale = size_int (gs_info.scale);
> + tree scale = size_int (gs_info.scale);
>
> - if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> - {
> - if (loop_lens)
> - final_len = vect_get_loop_len (loop_vinfo, gsi,
> - loop_lens,
> - ncopies * vec_num,
> - vectype, j, 1);
> - else
> - final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> - signed char biasval
> - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> - bias = build_int_cst (intQI_type_node, biasval);
> - if (!final_mask)
> - {
> - mask_vectype = truth_type_for (vectype);
> - final_mask = build_minus_one_cst (mask_vectype);
> - }
> - }
> + if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> + {
> + if (loop_lens)
> + final_len = vect_get_loop_len (loop_vinfo, gsi,
> + loop_lens, num_stmts,
> + vectype, j, 1);
> + else
> + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>
> - gcall *call;
> - if (final_len && final_mask)
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> {
> - if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> - call = gimple_build_call_internal (
> - IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> - vec_offset, scale, vec_oprnd, final_mask, final_len,
> - bias);
> - else
> - /* Non-vector offset indicates that prefer to take
> - MASK_LEN_STRIDED_STORE instead of the
> - IFN_MASK_SCATTER_STORE with direct stride arg. */
> - call = gimple_build_call_internal (
> - IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> - vec_offset, vec_oprnd, final_mask, final_len, bias);
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> }
> - else if (final_mask)
> - call = gimple_build_call_internal
> - (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> - vec_offset, scale, vec_oprnd, final_mask);
> + }
> +
> + gcall *call;
> + if (final_len && final_mask)
> + {
> + if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> + vec_offset, scale, vec_oprnd, final_mask, final_len,
> + bias);
> else
> - call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> - dataref_ptr, vec_offset,
> - scale, vec_oprnd);
> - gimple_call_set_nothrow (call, true);
> - vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> - new_stmt = call;
> + /* Non-vector offset indicates that prefer to take
> + MASK_LEN_STRIDED_STORE instead of the
> + IFN_MASK_SCATTER_STORE with direct stride arg. */
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> + vec_offset, vec_oprnd, final_mask, final_len, bias);
> }
> - else if (gs_info.decl)
> + else if (final_mask)
> + call = gimple_build_call_internal
> + (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> + vec_offset, scale, vec_oprnd, final_mask);
> + else
> + call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> + dataref_ptr, vec_offset,
> + scale, vec_oprnd);
> + gimple_call_set_nothrow (call, true);
> + vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> + new_stmt = call;
> + }
> + else if (gs_info.decl)
> + {
> + /* The builtin decls path for scatter is legacy, x86 only. */
> + gcc_assert (nunits.is_constant ()
> + && (!final_mask
> + || SCALAR_INT_MODE_P
> + (TYPE_MODE (TREE_TYPE (final_mask)))));
> + if (costing_p)
> {
> - /* The builtin decls path for scatter is legacy, x86 only. */
> - gcc_assert (nunits.is_constant ()
> - && (!final_mask
> - || SCALAR_INT_MODE_P
> - (TYPE_MODE (TREE_TYPE (final_mask)))));
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0, vect_body);
> - continue;
> - }
> - poly_uint64 offset_nunits
> - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> - if (known_eq (nunits, offset_nunits))
> - {
> - new_stmt = vect_build_one_scatter_store_call
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0, vect_body);
> + continue;
> + }
> +
> + poly_uint64 offset_nunits
> + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> + if (known_eq (nunits, offset_nunits))
> + {
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr, vec_offsets[vec_num * j + i],
> + dataref_ptr, vec_offsets[j],
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else if (known_eq (nunits, offset_nunits * 2))
> - {
> - /* We have a offset vector with half the number of
> - lanes but the builtins will store full vectype
> - data from the lower lanes. */
> - new_stmt = vect_build_one_scatter_store_call
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else if (known_eq (nunits, offset_nunits * 2))
> + {
> + /* We have a offset vector with half the number of
> + lanes but the builtins will store full vectype
> + data from the lower lanes. */
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr,
> - vec_offsets[2 * vec_num * j + 2 * i],
> + dataref_ptr, vec_offsets[2 * j],
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> + vect_finish_stmt_generation (vinfo, stmt_info,
> new_stmt, gsi);
> - int count = nunits.to_constant ();
> - vec_perm_builder sel (count, count, 1);
> - sel.quick_grow (count);
> - for (int i = 0; i < count; ++i)
> - sel[i] = i | (count / 2);
> - vec_perm_indices indices (sel, 2, count);
> - tree perm_mask
> - = vect_gen_perm_mask_checked (vectype, indices);
> - new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> - vec_oprnd, vec_oprnd,
> - perm_mask);
> - vec_oprnd = make_ssa_name (vectype);
> - gimple_set_lhs (new_stmt, vec_oprnd);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - if (final_mask)
> - {
> - new_stmt = gimple_build_assign (NULL_TREE,
> - VEC_UNPACK_HI_EXPR,
> - final_mask);
> - final_mask = make_ssa_name
> + int count = nunits.to_constant ();
> + vec_perm_builder sel (count, count, 1);
> + sel.quick_grow (count);
> + for (int i = 0; i < count; ++i)
> + sel[i] = i | (count / 2);
> + vec_perm_indices indices (sel, 2, count);
> + tree perm_mask
> + = vect_gen_perm_mask_checked (vectype, indices);
> + new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> + vec_oprnd, vec_oprnd,
> + perm_mask);
> + vec_oprnd = make_ssa_name (vectype);
> + gimple_set_lhs (new_stmt, vec_oprnd);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + if (final_mask)
> + {
> + new_stmt = gimple_build_assign (NULL_TREE,
> + VEC_UNPACK_HI_EXPR,
> + final_mask);
> + final_mask = make_ssa_name
> (truth_type_for (gs_info.offset_vectype));
> - gimple_set_lhs (new_stmt, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> + gimple_set_lhs (new_stmt, final_mask);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> }
> - new_stmt = vect_build_one_scatter_store_call
> - (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr,
> - vec_offsets[2 * vec_num * j + 2 * i + 1],
> - vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else if (known_eq (nunits * 2, offset_nunits))
> - {
> - /* We have a offset vector with double the number of
> - lanes. Select the low/high part accordingly. */
> - vec_offset = vec_offsets[(vec_num * j + i) / 2];
> - if ((vec_num * j + i) & 1)
> - {
> - int count = offset_nunits.to_constant ();
> - vec_perm_builder sel (count, count, 1);
> - sel.quick_grow (count);
> - for (int i = 0; i < count; ++i)
> - sel[i] = i | (count / 2);
> - vec_perm_indices indices (sel, 2, count);
> - tree perm_mask = vect_gen_perm_mask_checked
> - (TREE_TYPE (vec_offset), indices);
> - new_stmt = gimple_build_assign (NULL_TREE,
> - VEC_PERM_EXPR,
> - vec_offset,
> - vec_offset,
> - perm_mask);
> - vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> - gimple_set_lhs (new_stmt, vec_offset);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - new_stmt = vect_build_one_scatter_store_call
> +
> + new_stmt = vect_build_one_scatter_store_call
> + (vinfo, stmt_info, gsi, &gs_info,
> + dataref_ptr, vec_offsets[2 * j + 1],
> + vec_oprnd, final_mask);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else if (known_eq (nunits * 2, offset_nunits))
> + {
> + /* We have a offset vector with double the number of
> + lanes. Select the low/high part accordingly. */
> + vec_offset = vec_offsets[j / 2];
> + if (j & 1)
> + {
> + int count = offset_nunits.to_constant ();
> + vec_perm_builder sel (count, count, 1);
> + sel.quick_grow (count);
> + for (int i = 0; i < count; ++i)
> + sel[i] = i | (count / 2);
> + vec_perm_indices indices (sel, 2, count);
> + tree perm_mask = vect_gen_perm_mask_checked
> + (TREE_TYPE (vec_offset), indices);
> + new_stmt = gimple_build_assign (NULL_TREE,
> + VEC_PERM_EXPR,
> + vec_offset,
> + vec_offset,
> + perm_mask);
> + vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> + gimple_set_lhs (new_stmt, vec_offset);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> +
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> dataref_ptr, vec_offset,
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else
> - gcc_unreachable ();
> - }
> - else
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else
> + gcc_unreachable ();
> + }
> + else
> + {
> + /* Emulated scatter. */
> + gcc_assert (!final_mask);
> + if (costing_p)
> {
> - /* Emulated scatter. */
> - gcc_assert (!final_mask);
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - /* For emulated scatter N offset vector element extracts
> - (we assume the scalar scaling and ptr + offset add is
> - consumed by the load). */
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> - stmt_info, slp_node, 0, vect_body);
> - /* N scalar stores plus extracting the elements. */
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> - stmt_info, slp_node, 0, vect_body);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0, vect_body);
> - continue;
> - }
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + /* For emulated scatter N offset vector element extracts
> + (we assume the scalar scaling and ptr + offset add is
> + consumed by the load). */
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> + stmt_info, slp_node, 0, vect_body);
> + /* N scalar stores plus extracting the elements. */
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> + stmt_info, slp_node, 0, vect_body);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0, vect_body);
> + continue;
> + }
>
> - unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> - unsigned HOST_WIDE_INT const_offset_nunits
> - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> - vec<constructor_elt, va_gc> *ctor_elts;
> - vec_alloc (ctor_elts, const_nunits);
> - gimple_seq stmts = NULL;
> - tree elt_type = TREE_TYPE (vectype);
> - unsigned HOST_WIDE_INT elt_size
> - = tree_to_uhwi (TYPE_SIZE (elt_type));
> - /* We support offset vectors with more elements
> - than the data vector for now. */
> - unsigned HOST_WIDE_INT factor
> - = const_offset_nunits / const_nunits;
> - vec_offset = vec_offsets[(vec_num * j + i) / factor];
> - unsigned elt_offset
> - = ((vec_num * j + i) % factor) * const_nunits;
> - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> - tree scale = size_int (gs_info.scale);
> - align = get_object_alignment (DR_REF (first_dr_info->dr));
> - tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> - for (unsigned k = 0; k < const_nunits; ++k)
> - {
> - /* Compute the offsetted pointer. */
> - tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> - bitsize_int (k + elt_offset));
> - tree idx
> - = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> - vec_offset, TYPE_SIZE (idx_type), boff);
> - idx = gimple_convert (&stmts, sizetype, idx);
> - idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> - idx, scale);
> - tree ptr
> - = gimple_build (&stmts, PLUS_EXPR,
> - TREE_TYPE (dataref_ptr),
> - dataref_ptr, idx);
> - ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> - /* Extract the element to be stored. */
> - tree elt
> - = gimple_build (&stmts, BIT_FIELD_REF,
> - TREE_TYPE (vectype),
> - vec_oprnd, TYPE_SIZE (elt_type),
> - bitsize_int (k * elt_size));
> - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> - stmts = NULL;
> - tree ref
> - = build2 (MEM_REF, ltype, ptr,
> - build_int_cst (ref_type, 0));
> - new_stmt = gimple_build_assign (ref, elt);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - if (slp)
> - slp_node->push_vec_def (new_stmt);
> + unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> + unsigned HOST_WIDE_INT const_offset_nunits
> + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> + vec<constructor_elt, va_gc> *ctor_elts;
> + vec_alloc (ctor_elts, const_nunits);
> + gimple_seq stmts = NULL;
> + tree elt_type = TREE_TYPE (vectype);
> + unsigned HOST_WIDE_INT elt_size
> + = tree_to_uhwi (TYPE_SIZE (elt_type));
> + /* We support offset vectors with more elements
> + than the data vector for now. */
> + unsigned HOST_WIDE_INT factor
> + = const_offset_nunits / const_nunits;
> + vec_offset = vec_offsets[j / factor];
> + unsigned elt_offset
> + = (j % factor) * const_nunits;
> + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> + tree scale = size_int (gs_info.scale);
> + align = get_object_alignment (DR_REF (first_dr_info->dr));
> + tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> + for (unsigned k = 0; k < const_nunits; ++k)
> + {
> + /* Compute the offsetted pointer. */
> + tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> + bitsize_int (k + elt_offset));
> + tree idx
> + = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> + vec_offset, TYPE_SIZE (idx_type), boff);
> + idx = gimple_convert (&stmts, sizetype, idx);
> + idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> + idx, scale);
> + tree ptr
> + = gimple_build (&stmts, PLUS_EXPR,
> + TREE_TYPE (dataref_ptr),
> + dataref_ptr, idx);
> + ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> + /* Extract the element to be stored. */
> + tree elt
> + = gimple_build (&stmts, BIT_FIELD_REF,
> + TREE_TYPE (vectype),
> + vec_oprnd, TYPE_SIZE (elt_type),
> + bitsize_int (k * elt_size));
> + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> + stmts = NULL;
> + tree ref
> + = build2 (MEM_REF, ltype, ptr,
> + build_int_cst (ref_type, 0));
> + new_stmt = gimple_build_assign (ref, elt);
> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> }
> +
> + if (slp)
> + slp_node->push_vec_def (new_stmt);
> }
> +
> if (!slp && !costing_p)
> STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> }