[2/5] Vect: Introduce MASK_LEN_STRIDED_LOAD{STORE} to loop vectorizer
Checks
Context | Check | Description
rivoscibot/toolchain-ci-rivos-apply-patch | success | Patch applied
rivoscibot/toolchain-ci-rivos-lint | success | Lint passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib | success | Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib | success | Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib | success | Build passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gc-lp64d-non-multilib | success | Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc-lp64d-non-multilib | success | Build passed
rivoscibot/toolchain-ci-rivos-test | success | Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm | success | Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 | success | Build passed
Commit Message
From: Pan Li <pan2.li@intel.com>
This patch allows generation of MASK_LEN_STRIDED_LOAD{STORE} IR for
invariant-stride memory accesses.  For example, given:
void foo (int * __restrict a, int * __restrict b, int stride, int n)
{
  for (int i = 0; i < n; i++)
    a[i*stride] = b[i*stride] + 100;
}
Before this patch:
66 │ _73 = .SELECT_VL (ivtmp_71, POLY_INT_CST [4, 4]);
67 │ _52 = _54 * _73;
68 │ vect__5.16_61 = .MASK_LEN_GATHER_LOAD (vectp_b.14_59, _58, 4, { 0, ... }, { -1, ... }, _73, 0);
69 │ vect__7.17_63 = vect__5.16_61 + { 100, ... };
70 │ .MASK_LEN_SCATTER_STORE (vectp_a.18_67, _58, 4, vect__7.17_63, { -1, ... }, _73, 0);
71 │ vectp_b.14_60 = vectp_b.14_59 + _52;
72 │ vectp_a.18_68 = vectp_a.18_67 + _52;
73 │ ivtmp_72 = ivtmp_71 - _73;
After this patch:
60 │ _70 = .SELECT_VL (ivtmp_68, POLY_INT_CST [4, 4]);
61 │ _52 = _54 * _70;
62 │ vect__5.16_58 = .MASK_LEN_STRIDED_LOAD (vectp_b.14_56, _55, { 0, ... }, { -1, ... }, _70, 0);
63 │ vect__7.17_60 = vect__5.16_58 + { 100, ... };
64 │ .MASK_LEN_STRIDED_STORE (vectp_a.18_64, _55, vect__7.17_60, { -1, ... }, _70, 0);
65 │ vectp_b.14_57 = vectp_b.14_56 + _52;
66 │ vectp_a.18_65 = vectp_a.18_64 + _52;
67 │ ivtmp_69 = ivtmp_68 - _70;
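For reference, the effect of one vectorized iteration can be sketched in scalar C as below.  This is a minimal illustration, not GCC code: the function name is made up, and it assumes (following the vect_get_strided_load_store_ops hunk) that the stride operand (_55 above) is the byte step DR_STEP, i.e. stride * sizeof (int) for this loop, with all mask bits set and a zero bias as in the dump.
static void
strided_iteration (int *a, int *b, long byte_step, long vl)
{
  /* vl is the element count produced by .SELECT_VL for this iteration.  */
  for (long i = 0; i < vl; i++)
    {
      /* .MASK_LEN_STRIDED_LOAD: element i is loaded from b advanced by
         i * byte_step bytes.  */
      int tmp = *(int *) ((char *) b + i * byte_step);
      tmp += 100;
      /* .MASK_LEN_STRIDED_STORE: element i is stored to a advanced by
         i * byte_step bytes.  */
      *(int *) ((char *) a + i * byte_step) = tmp;
    }
}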
The following test suites passed for this patch:
* The x86 bootstrap test.
* The x86 fully regression test.
* The riscv fully regression test.
gcc/ChangeLog:
* tree-vect-stmts.cc (vect_get_strided_load_store_ops): Handle
MASK_LEN_STRIDED_LOAD{STORE} after checking it is supported.
(vectorizable_store): Generate MASK_LEN_STRIDED_STORE when the offset
of the scatter is not a vector type.
(vectorizable_load): Ditto, but generate MASK_LEN_STRIDED_LOAD when
the offset of the gather is not a vector type.
Signed-off-by: Pan Li <pan2.li@intel.com>
Co-Authored-By: Juzhe-Zhong <juzhe.zhong@rivai.ai>
---
gcc/tree-vect-stmts.cc | 45 +++++++++++++++++++++++++++++++++---------
1 file changed, 36 insertions(+), 9 deletions(-)
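As an additional usage illustration (my own sketch, not one of the patch's test cases): any loop whose stride is loop invariant but only known at run time should take the same strided form on a target that advertises the strided internal functions, e.g. RISC-V with the vector extension.
/* Illustrative only; the function name is made up and the expectation
   that the strided IFNs are used here is an assumption based on the
   description above, not something verified in the patch.  */
void
axpy_strided (float *__restrict y, float *__restrict x, float alpha,
              int stride, int n)
{
  for (int i = 0; i < n; i++)
    y[i * stride] += alpha * x[i * stride];
}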
Comments
On Wed, Oct 23, 2024 at 12:47 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch allows generation of MASK_LEN_STRIDED_LOAD{STORE} IR for
> invariant-stride memory accesses.  For example, given:
>
> void foo (int * __restrict a, int * __restrict b, int stride, int n)
> {
>   for (int i = 0; i < n; i++)
>     a[i*stride] = b[i*stride] + 100;
> }
>
> Before this patch:
> 66 │ _73 = .SELECT_VL (ivtmp_71, POLY_INT_CST [4, 4]);
> 67 │ _52 = _54 * _73;
> 68 │ vect__5.16_61 = .MASK_LEN_GATHER_LOAD (vectp_b.14_59, _58, 4, { 0, ... }, { -1, ... }, _73, 0);
> 69 │ vect__7.17_63 = vect__5.16_61 + { 100, ... };
> 70 │ .MASK_LEN_SCATTER_STORE (vectp_a.18_67, _58, 4, vect__7.17_63, { -1, ... }, _73, 0);
> 71 │ vectp_b.14_60 = vectp_b.14_59 + _52;
> 72 │ vectp_a.18_68 = vectp_a.18_67 + _52;
> 73 │ ivtmp_72 = ivtmp_71 - _73;
>
> After this patch:
> 60 │ _70 = .SELECT_VL (ivtmp_68, POLY_INT_CST [4, 4]);
> 61 │ _52 = _54 * _70;
> 62 │ vect__5.16_58 = .MASK_LEN_STRIDED_LOAD (vectp_b.14_56, _55, { 0, ... }, { -1, ... }, _70, 0);
> 63 │ vect__7.17_60 = vect__5.16_58 + { 100, ... };
> 64 │ .MASK_LEN_STRIDED_STORE (vectp_a.18_64, _55, vect__7.17_60, { -1, ... }, _70, 0);
> 65 │ vectp_b.14_57 = vectp_b.14_56 + _52;
> 66 │ vectp_a.18_65 = vectp_a.18_64 + _52;
> 67 │ ivtmp_69 = ivtmp_68 - _70;
>
> The following test suites passed for this patch:
> * The x86 bootstrap test.
> * The x86 fully regression test.
> * The riscv fully regression test.
>
> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_get_strided_load_store_ops): Handle
> MASK_LEN_STRIDED_LOAD{STORE} after checking it is supported.
> (vectorizable_store): Generate MASK_LEN_STRIDED_STORE when the offset
> of the scatter is not a vector type.
> (vectorizable_load): Ditto, but generate MASK_LEN_STRIDED_LOAD when
> the offset of the gather is not a vector type.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> Co-Authored-By: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> ---
> gcc/tree-vect-stmts.cc | 45 +++++++++++++++++++++++++++++++++---------
> 1 file changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index e7f14c3144c..78d66a4ef9d 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2950,6 +2950,15 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> }
>
> + internal_fn ifn
> + = DR_IS_READ (dr) ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
> + if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> + {
> + *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
> + unshare_expr (DR_STEP (dr)));
> + return;
> + }
I'll note that to get here the target has to claim support for general
gather/scatter; I guess that's OK for now and for RISC-V specifically.
OK.
Thanks,
Richard.
> +
> /* The offset given in GS_INFO can have pointer type, so use the element
> type of the vector instead. */
> tree offset_type = TREE_TYPE (gs_info->offset_vectype);
> @@ -9194,10 +9203,20 @@ vectorizable_store (vec_info *vinfo,
>
> gcall *call;
> if (final_len && final_mask)
> - call = gimple_build_call_internal
> - (IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> - vec_offset, scale, vec_oprnd, final_mask,
> - final_len, bias);
> + {
> + if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> + vec_offset, scale, vec_oprnd, final_mask, final_len,
> + bias);
> + else
> + /* A non-vector offset indicates that we prefer to take
> + MASK_LEN_STRIDED_STORE instead of MASK_LEN_SCATTER_STORE
> + with a direct stride argument.  */
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> + vec_offset, vec_oprnd, final_mask, final_len, bias);
> + }
> else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> @@ -11194,11 +11213,19 @@ vectorizable_load (vec_info *vinfo,
>
> gcall *call;
> if (final_len && final_mask)
> - call
> - = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
> - dataref_ptr, vec_offset,
> - scale, zero, final_mask,
> - final_len, bias);
> + {
> + if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
> + scale, zero, final_mask, final_len, bias);
> + else
> + /* A non-vector offset indicates that we prefer to take
> + MASK_LEN_STRIDED_LOAD instead of MASK_LEN_GATHER_LOAD
> + with a direct stride argument.  */
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_STRIDED_LOAD, 6, dataref_ptr, vec_offset,
> + zero, final_mask, final_len, bias);
> + }
> else if (final_mask)
> call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> dataref_ptr, vec_offset,
> --
> 2.43.0
>