VECT: Add SELECT_VL support

Message ID: 20230525152609.280197-1-juzhe.zhong@rivai.ai
State: New
Series: VECT: Add SELECT_VL support

Commit Message

juzhe.zhong@rivai.ai May 25, 2023, 3:26 p.m. UTC
  From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

This patch adds SELECT_VL middle-end support, allowing targets to apply
target-dependent optimizations to the length calculation.

This patch is inspired by the RVV ISA and by LLVM:
https://reviews.llvm.org/D99750

SELECT_VL has the same behavior as LLVM's "get_vector_length", with the
following properties:

1. It only applies to single-rgroup control.
2. It is not used for SLP.
3. It adjusts the loop control IV.
4. It adjusts the data reference IVs.
5. It allows a non-final iteration to process fewer than VF elements.

Code:
   # void vvaddint32(size_t n, const int*x, const int*y, int*z)
   # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }

Take RVV codegen as an example:

Before this patch:
vvaddint32:
        ble     a0,zero,.L6
        csrr    a4,vlenb
        srli    a6,a4,2
.L4:
        mv      a5,a0
        bleu    a0,a6,.L3
        mv      a5,a6
.L3:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli a7,zero,e32,m1,ta,ma
        sub     a0,a0,a5
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a3)
        add     a2,a2,a4
        add     a3,a3,a4
        add     a1,a1,a4
        bne     a0,zero,.L4
.L6:
        ret

After this patch:

vvaddint32:
    vsetvli t0, a0, e32, ta, ma  # Set vector length based on 32-bit vectors
    vle32.v v0, (a1)         # Get first vector
      sub a0, a0, t0         # Decrement number done
      slli t0, t0, 2         # Multiply number done by 4 bytes
      add a1, a1, t0         # Bump pointer
    vle32.v v1, (a2)         # Get second vector
      add a2, a2, t0         # Bump pointer
    vadd.vv v2, v0, v1       # Sum vectors
    vse32.v v2, (a3)         # Store result
      add a3, a3, t0         # Bump pointer
      bnez a0, vvaddint32    # Loop back
      ret                    # Finished
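
In scalar terms, the contract the vectorizer relies on can be sketched
like this (purely illustrative; the function name and the assumption that
the result stays non-zero while work remains are mine, not part of the
optab definition):

    /* SELECT_VL returns the number of scalar iterations to handle in one
       vector iteration.  Returning MIN (remaining, vf) is always valid,
       but a target may legally return less (as RVV's vsetvli can, e.g. to
       balance the last iterations).  */
    static size_t
    select_vl_sketch (size_t remaining, size_t vf)
    {
      size_t max = remaining < vf ? remaining : vf;
      return max;  /* Simplest conforming choice; targets may pick less.  */
    }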

gcc/ChangeLog:

        * doc/md.texi: Add SELECT_VL support.
        * internal-fn.def (SELECT_VL): Ditto.
        * optabs.def (OPTAB_D): Ditto.
        * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
        * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.

---
 gcc/doc/md.texi             | 22 ++++++++++++
 gcc/internal-fn.def         |  1 +
 gcc/optabs.def              |  1 +
 gcc/tree-vect-loop-manip.cc | 13 ++++---
 gcc/tree-vect-loop.cc       |  9 +++++
 gcc/tree-vect-stmts.cc      | 70 +++++++++++++++++++++++++++++++++++++
 gcc/tree-vectorizer.h       |  6 ++++
 7 files changed, 118 insertions(+), 4 deletions(-)
  

Comments

juzhe.zhong@rivai.ai May 30, 2023, 7:30 a.m. UTC | #1
Hi, this patch passes bootstrap.

OK for trunk?

Thanks.


juzhe.zhong@rivai.ai
 
  
Richard Sandiford June 4, 2023, 9:59 p.m. UTC | #2
Sorry for the slow review.

I don't know the IV-related parts well enough to review those properly,
but they looked reasonable to me.  Hopefully Richi can comment.

I'm curious though.  For:

> +  tree step = vect_dr_behavior (vinfo, dr_info)->step;
> +
> +  [...]
> +  poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
> +  /* Since the outcome of .SELECT_VL is element size, we should adjust
> +     it into bytesize so that it can be used in address pointer variable
> +     amount IVs adjustment.  */
> +  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
> +			  build_int_cst (len_type, bytesize));
> +  if (tree_int_cst_sgn (step) == -1)
> +    tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);

Could you not just multiply loop_len by step, probably written as:

  build_int_cst (len_type, wi::to_widest (step))

avoiding the NEGATE_EXPR and bytesize calculation?  step should
represent the step of the original scalar IV, so doing that feels
more direct.
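
I.e. something like this (untested, just to show where it would plug in,
reusing the names from your patch):

  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
                          build_int_cst (len_type, wi::to_widest (step)));
  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
  gassign *assign = gimple_build_assign (bump, tmp);
  gsi_insert_before (gsi, assign, GSI_SAME_STMT);

with the bytesize/NEGATE_EXPR code dropped, since step should already be
the signed byte step per scalar element.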

The loop-control bits look good to me apart from one hunk:

> @@ -2737,6 +2738,14 @@ start_over:
>  			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
>      LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
>  
> +  /* If we're using decrement IV and SELECT_VL is supported by the target.
> +     Use output of SELECT_VL to adjust IV of loop control and data reference.
> +     Note: We only use SELECT_VL on single-rgroup control.  */
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +      && LOOP_VINFO_LENS (loop_vinfo).length () == 1
> +      && !slp)
> +    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */

This test also needs to check that the target implements the select_vl
optab for the chosen iv type.  You can check that using
direct_internal_fn_supported_p.

We should also check that LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1,
since the IV update multiplies by the size in bytes.
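
Putting those together, the condition might end up looking something like
this (untested; "iv_type" here stands for whatever IV type the loop-control
code ends up using):

  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
      && LOOP_VINFO_LENS (loop_vinfo).length () == 1
      && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
      && !slp
      && direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
                                         OPTIMIZE_FOR_SPEED))
    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;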

I think it would be worth saying in more detail why we only use SELECT_VL
for single rgroups.  I assume the reason is to simplify the pointer IV
updates.  Is that right?

That is: the multiple length controls that are currently generated from
a MIN_EXPR IV should also work with a SELECT_VL IV.  The difficulty is
that an rgroup that controls N vector loads (say) would need N pointer
updates by variable amounts.  But I'm not 100% sure whether we're
avoiding that situation because it's difficult to code, or because
it's inefficient.  Or maybe we're avoiding it because it doesn't
fit well with the later RVV vsetvl pass.

Thanks,
Richard
  
juzhe.zhong@rivai.ai June 4, 2023, 10:47 p.m. UTC | #3
Hi, Richard. Thanks for the comments.

>> I think it would be worth saying in more detail why we only use SELECT_VL
>> for single rgroups.  I assume the reason is to simplify the pointer IV
>> updates.  Is that right?

Yes.

>> The difficulty is
>> that an rgroup that controls N vector loads (say) would need N pointer
>> updates by variable amounts.  But I'm not 100% sure whether we're
>> avoiding that situation because it's difficult to code, or because
>> it's inefficient.  Or maybe we're avoiding it because it doesn't
>> fit well with the later RVV vsetvl pass.
I don't want to use the SELECT_VL pattern for multiple rgroups since it's
really inefficient and also doesn't fit well with the later RVV vsetvl
pass.  (I have a draft in my downstream tree; the loop body becomes very
ugly, with a lot of instructions to adjust the IVs.)

Since we define SELECT_VL as a flexible pattern that doesn't have the
side effect of setting the vector length, we need many more scalar
operations to adjust the pointer IVs, and it causes more vsetvli
instructions than just using MIN.

Also, it changes a lot in the middle end, makes the middle-end code much
uglier, and I see no benefit so far.

With the current approach (MIN), I think the current codegen is good
(even though it may not be perfect).

Besides, LLVM can only handle one vector length (it has no equivalent of
what we call multiple rgroups in GCC).

I think RVV GCC is already in good shape now in terms of loop control.

I'd rather support all the RVV auto-vectorization features soon and focus
on optimizing the loop vectorizer (for example, I know GCC has 46 failures
in TSVC; fixing the failed vectorization cases will improve things much
more, and I think it can also help ARM).

I will address the comments and send the next patch.  Really appreciate it.

Thanks.  


juzhe.zhong@rivai.ai
 
  

Patch

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6a435eb4461..95f7fe1f802 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4974,6 +4974,28 @@  for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @code{select_vl@var{m}}
+Set operand 0 to the number of scalar iterations that should be handled
+by one iteration of a vector loop.  Operand 1 is the total number of
+scalar iterations that the loop needs to process and operand 2 is a
+maximum bound on the result (also known as the maximum ``vectorization
+factor'').
+
+The maximum value of operand 0 is given by:
+@smallexample
+operand0 = MIN (operand1, operand2)
+@end smallexample
+However, targets might choose a lower value than this, based on
+target-specific criteria.  Each iteration of the vector loop might
+therefore process a different number of scalar iterations, which in turn
+means that induction variables will have a variable step.  Because of
+this, it is generally not useful to define this instruction if it will
+always calculate the maximum value.
+
+This optab is only useful on targets that implement @samp{len_load_@var{m}}
+and/or @samp{len_store_@var{m}}.
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern
 @item @samp{check_raw_ptrs@var{m}}
 Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..6f6fa7d37f9 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@  DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
 		       check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..b637471b76e 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@  OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
 OPTAB_D (len_load_optab, "len_load_$a")
 OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index acf3642ceb2..3ca61663684 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -534,7 +534,7 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 	   _10 = (unsigned long) count_12(D);
 	   ...
 	   # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
-	   _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+	   _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
 	   ...
 	   vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
 	   ...
@@ -551,9 +551,14 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
       /* Create decrement IV.  */
       create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
 		 insert_after, &index_before_incr, &index_after_incr);
-      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
-							    index_before_incr,
-							    nitems_step));
+      tree len = NULL_TREE;
+      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+	len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
+			    index_before_incr, nitems_step);
+      else
+	len = gimple_build (header_seq, MIN_EXPR, iv_type, index_before_incr,
+			    nitems_step);
+      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
       *iv_step = step;
       return index_after_incr;
     }
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5b7a0da0034..f67340976c8 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -974,6 +974,7 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
     using_decrementing_iv_p (false),
+    using_select_vl_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2737,6 +2738,14 @@  start_over:
 			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
 
+  /* If we're using a decrementing IV and the target supports SELECT_VL,
+     use the output of SELECT_VL to adjust the loop control and data
+     reference IVs.  Note: we only use SELECT_VL on single-rgroup control.  */
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+      && LOOP_VINFO_LENS (loop_vinfo).length () == 1
+      && !slp)
+    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 127b987cd62..8e8b0f71a4a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3147,6 +3147,61 @@  vect_get_data_ptr_increment (vec_info *vinfo,
   return iv_step;
 }
 
+/* Prepare the pointer IVs which need to be updated by a variable amount.
+   That variable amount is the outcome of .SELECT_VL.  In this case, each
+   iteration may process a different number of elements, as long as that
+   number is <= VF.
+
+   Return the data reference pointer according to SELECT_VL.
+   If new statements are needed, insert them before GSI.  */
+
+static tree
+get_select_vl_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
+			    tree aggr_type, class loop *at_loop, tree offset,
+			    tree *dummy, gimple_stmt_iterator *gsi,
+			    bool simd_lane_access_p, vec_loop_lens *loop_lens,
+			    dr_vec_info *dr_info,
+			    vect_memory_access_type memory_access_type)
+{
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  tree step = vect_dr_behavior (vinfo, dr_info)->step;
+
+  /* TODO: We don't yet support gather/scatter or load_lanes/store_lanes when
+     the pointer IVs are updated by a variable amount, but we will support
+     them in the future.  */
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+	      && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+  /* When we use the SELECT_VL pattern, we dynamically adjust
+     the memory address by the .SELECT_VL result.
+
+     The result of .SELECT_VL is the number of elements to
+     be processed in each iteration.  So the memory address
+     adjustment operation should be:
+
+     bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+     addr = addr + .SELECT_VL (ARG..) * bytesize;
+  */
+  gimple *ptr_incr;
+  tree loop_len
+    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
+  tree len_type = TREE_TYPE (loop_len);
+  poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+  /* Since the outcome of .SELECT_VL is a number of elements, scale it
+     by the element size in bytes so that it can be used to bump the
+     data pointer IVs by a variable amount.  */
+  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+			  build_int_cst (len_type, bytesize));
+  if (tree_int_cst_sgn (step) == -1)
+    tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);
+  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+  gassign *assign = gimple_build_assign (bump, tmp);
+  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+  return vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop, offset,
+				   dummy, gsi, &ptr_incr, simd_lane_access_p,
+				   bump);
+}
+
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}.  */
 
 static bool
@@ -8558,6 +8613,14 @@  vectorizable_store (vec_info *vinfo,
 	    vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
 					 slp_node, &gs_info, &dataref_ptr,
 					 &vec_offsets);
+	  else if (loop_lens && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+		   && memory_access_type != VMAT_INVARIANT)
+	    dataref_ptr
+	      = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+					    simd_lane_access_p ? loop : NULL,
+					    offset, &dummy, gsi,
+					    simd_lane_access_p, loop_lens,
+					    dr_info, memory_access_type);
 	  else
 	    dataref_ptr
 	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -9949,6 +10012,13 @@  vectorizable_load (vec_info *vinfo,
 					   slp_node, &gs_info, &dataref_ptr,
 					   &vec_offsets);
 	    }
+	  else if (loop_lens && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+		   && memory_access_type != VMAT_INVARIANT)
+	    dataref_ptr
+	      = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+					    at_loop, offset, &dummy, gsi,
+					    simd_lane_access_p, loop_lens,
+					    dr_info, memory_access_type);
 	  else
 	    dataref_ptr
 	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fba09b9ffd3..97dd9ec06e8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -825,6 +825,11 @@  public:
 	(b) can iterate more than once.  */
   bool using_decrementing_iv_p;
 
+  /* True if we've decided to use the output of SELECT_VL to adjust the IVs
+     of both the loop control and the data reference pointers.  This is
+     only true for single-rgroup control.  */
+  bool using_select_vl_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -898,6 +903,7 @@  public:
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
 #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
+#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias