[2/5] Vect: Introduce MASK_LEN_STRIDED_LOAD{STORE} to loop vectorizer

Message ID 20241023104516.2818244-2-pan2.li@intel.com
State Committed
Commit a0292ddb21475e8fd238e201d3b64f0ab02ace04
Series [1/5] Internal-fn: Introduce new IFN MASK_LEN_STRIDED_LOAD{STORE}

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
rivoscibot/toolchain-ci-rivos-lint success Lint passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gc-lp64d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc-lp64d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-test success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed

Commit Message

Li, Pan2 Oct. 23, 2024, 10:45 a.m. UTC
  From: Pan Li <pan2.li@intel.com>

This patch allows the loop vectorizer to generate MASK_LEN_STRIDED_LOAD{STORE} IR
for memory accesses with an invariant stride.  For example, given the loop below:

void foo (int * __restrict a, int * __restrict b, int stride, int n)
{
    for (int i = 0; i < n; i++)
      a[i*stride] = b[i*stride] + 100;
}

Before this patch:
  66   │   _73 = .SELECT_VL (ivtmp_71, POLY_INT_CST [4, 4]);
  67   │   _52 = _54 * _73;
  68   │   vect__5.16_61 = .MASK_LEN_GATHER_LOAD (vectp_b.14_59, _58, 4, { 0, ... }, { -1, ... }, _73, 0);
  69   │   vect__7.17_63 = vect__5.16_61 + { 100, ... };
  70   │   .MASK_LEN_SCATTER_STORE (vectp_a.18_67, _58, 4, vect__7.17_63, { -1, ... }, _73, 0);
  71   │   vectp_b.14_60 = vectp_b.14_59 + _52;
  72   │   vectp_a.18_68 = vectp_a.18_67 + _52;
  73   │   ivtmp_72 = ivtmp_71 - _73;

After this patch:
  60   │   _70 = .SELECT_VL (ivtmp_68, POLY_INT_CST [4, 4]);
  61   │   _52 = _54 * _70;
  62   │   vect__5.16_58 = .MASK_LEN_STRIDED_LOAD (vectp_b.14_56, _55, { 0, ... }, { -1, ... }, _70, 0);
  63   │   vect__7.17_60 = vect__5.16_58 + { 100, ... };
  64   │   .MASK_LEN_STRIDED_STORE (vectp_a.18_64, _55, vect__7.17_60, { -1, ... }, _70, 0);
  65   │   vectp_b.14_57 = vectp_b.14_56 + _52;
  66   │   vectp_a.18_65 = vectp_a.18_64 + _52;
  67   │   ivtmp_69 = ivtmp_68 - _70;
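
As a reading aid, the strided calls above can be modelled by the scalar sketch
below.  The argument order follows the IR: base pointer, invariant byte stride
(the DR_STEP of the data reference), else value for the load (stored value for
the store), mask, length and bias.  The helper names and the exact treatment of
inactive lanes are illustrative assumptions for readability, not GCC internals.

#include <stdbool.h>

/* Scalar model of .MASK_LEN_STRIDED_LOAD for int elements as in the example.
   Active lanes read BASE + i * BYTE_STRIDE, so no per-element offset vector
   has to be materialized; inactive lanes take the else value here.  */
static void
mask_len_strided_load_model (int *dest, const char *base, long byte_stride,
                             const int *else_vals, const bool *mask,
                             long len, long bias)
{
  long n = len + bias;  /* BIAS is 0 or -1 and folds into the length.  */
  for (long i = 0; i < n; i++)
    dest[i] = mask[i] ? *(const int *) (base + i * byte_stride) : else_vals[i];
}

/* Scalar model of .MASK_LEN_STRIDED_STORE: only active lanes are written.  */
static void
mask_len_strided_store_model (char *base, long byte_stride, const int *src,
                              const bool *mask, long len, long bias)
{
  long n = len + bias;
  for (long i = 0; i < n; i++)
    if (mask[i])
      *(int *) (base + i * byte_stride) = src[i];
}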

The following test suites passed for this patch (see also the testcase sketch after the list):
* The x86 bootstrap test.
* The x86 fully regression test.
* The riscv fully regression test.
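
For reference, a compile-only testcase along the following lines can check that
the strided IFNs are emitted on RISC-V.  The file location (e.g. somewhere under
gcc.target/riscv/rvv/), the options and the dump name are assumptions and may
differ from the tests added elsewhere in this series.

/* { dg-do compile } */
/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fno-vect-cost-model -fdump-tree-optimized" } */

void
foo (int *__restrict a, int *__restrict b, int stride, int n)
{
  for (int i = 0; i < n; i++)
    a[i * stride] = b[i * stride] + 100;
}

/* { dg-final { scan-tree-dump "MASK_LEN_STRIDED_LOAD" "optimized" } } */
/* { dg-final { scan-tree-dump "MASK_LEN_STRIDED_STORE" "optimized" } } */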

gcc/ChangeLog:

	* tree-vect-stmts.cc (vect_get_strided_load_store_ops): Handle
	MASK_LEN_STRIDED_LOAD{STORE} after the support check.
	(vectorizable_store): Generate MASK_LEN_STRIDED_STORE when the
	scatter offset is not a vector type.
	(vectorizable_load): Likewise, generating MASK_LEN_STRIDED_LOAD.

Signed-off-by: Pan Li <pan2.li@intel.com>
Co-Authored-By: Juzhe-Zhong <juzhe.zhong@rivai.ai>
---
 gcc/tree-vect-stmts.cc | 45 +++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)
  

Comments

Richard Biener Oct. 29, 2024, 10:46 a.m. UTC | #1
On Wed, Oct 23, 2024 at 12:47 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch would like to allow generation of MASK_LEN_STRIDED_LOAD{STORE} IR
> for invariant stride memory access.  For example as below
>
> void foo (int * __restrict a, int * __restrict b, int stride, int n)
> {
>     for (int i = 0; i < n; i++)
>       a[i*stride] = b[i*stride] + 100;
> }
>
> Before this patch:
>   66   │   _73 = .SELECT_VL (ivtmp_71, POLY_INT_CST [4, 4]);
>   67   │   _52 = _54 * _73;
>   68   │   vect__5.16_61 = .MASK_LEN_GATHER_LOAD (vectp_b.14_59, _58, 4, { 0, ... }, { -1, ... }, _73, 0);
>   69   │   vect__7.17_63 = vect__5.16_61 + { 100, ... };
>   70   │   .MASK_LEN_SCATTER_STORE (vectp_a.18_67, _58, 4, vect__7.17_63, { -1, ... }, _73, 0);
>   71   │   vectp_b.14_60 = vectp_b.14_59 + _52;
>   72   │   vectp_a.18_68 = vectp_a.18_67 + _52;
>   73   │   ivtmp_72 = ivtmp_71 - _73;
>
> After this patch:
>   60   │   _70 = .SELECT_VL (ivtmp_68, POLY_INT_CST [4, 4]);
>   61   │   _52 = _54 * _70;
>   62   │   vect__5.16_58 = .MASK_LEN_STRIDED_LOAD (vectp_b.14_56, _55, { 0, ... }, { -1, ... }, _70, 0);
>   63   │   vect__7.17_60 = vect__5.16_58 + { 100, ... };
>   64   │   .MASK_LEN_STRIDED_STORE (vectp_a.18_64, _55, vect__7.17_60, { -1, ... }, _70, 0);
>   65   │   vectp_b.14_57 = vectp_b.14_56 + _52;
>   66   │   vectp_a.18_65 = vectp_a.18_64 + _52;
>   67   │   ivtmp_69 = ivtmp_68 - _70;
>
> The below test suites are passed for this patch:
> * The x86 bootstrap test.
> * The x86 fully regression test.
> * The riscv fully regression test.
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vect_get_strided_load_store_ops): Handle
>         MASK_LEN_STRIDED_LOAD{STORE} after supported check.
>         (vectorizable_store): Generate MASK_LEN_STRIDED_LOAD when the offset
>         of gater is not vector type.
>         (vectorizable_load): Ditto but for store.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> Co-Authored-By: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> ---
>  gcc/tree-vect-stmts.cc | 45 +++++++++++++++++++++++++++++++++---------
>  1 file changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index e7f14c3144c..78d66a4ef9d 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2950,6 +2950,15 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
>        *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
>      }
>
> +  internal_fn ifn
> +    = DR_IS_READ (dr) ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
> +  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> +    {
> +      *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
> +                                                  unshare_expr (DR_STEP (dr)));
> +      return;
> +    }

I'll note that to get here the target has to claim support for general
gather/scatter; I guess that's OK for now and for RISC-V specifically.

OK.

Thanks,
Richard.

  

Patch

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index e7f14c3144c..78d66a4ef9d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2950,6 +2950,15 @@  vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
       *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
     }
 
+  internal_fn ifn
+    = DR_IS_READ (dr) ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
+  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+    {
+      *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
+						   unshare_expr (DR_STEP (dr)));
+      return;
+    }
+
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
   tree offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -9194,10 +9203,20 @@  vectorizable_store (vec_info *vinfo,
 
 		  gcall *call;
 		  if (final_len && final_mask)
-		    call = gimple_build_call_internal
-			     (IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
-			      vec_offset, scale, vec_oprnd, final_mask,
-			      final_len, bias);
+		    {
+		      if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
+			  vec_offset, scale, vec_oprnd, final_mask, final_len,
+			  bias);
+		      else
+			/* A non-vector offset indicates that we prefer
+			   MASK_LEN_STRIDED_STORE over MASK_LEN_SCATTER_STORE
+			   with a direct stride argument.  */
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
+			  vec_offset, vec_oprnd, final_mask, final_len, bias);
+		    }
 		  else if (final_mask)
 		    call = gimple_build_call_internal
 			     (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
@@ -11194,11 +11213,19 @@  vectorizable_load (vec_info *vinfo,
 
 		  gcall *call;
 		  if (final_len && final_mask)
-		    call
-		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
-						    dataref_ptr, vec_offset,
-						    scale, zero, final_mask,
-						    final_len, bias);
+		    {
+		      if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+			  scale, zero, final_mask, final_len, bias);
+		      else
+			/* A non-vector offset indicates that we prefer
+			   MASK_LEN_STRIDED_LOAD over MASK_LEN_GATHER_LOAD
+			   with a direct stride argument.  */
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_STRIDED_LOAD, 6, dataref_ptr, vec_offset,
+			  zero, final_mask, final_len, bias);
+		    }
 		  else if (final_mask)
 		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
 						       dataref_ptr, vec_offset,