RISC-V: Apply vla vs. vls mode heuristic vector COST model

Message ID 20231212142552.102285-1-juzhe.zhong@rivai.ai
State Committed
Commit 8501edba91ea63bdfc045f1cb66fb1c242e44e80
Delegated to: Robin Dapp
Headers
Series RISC-V: Apply vla vs. vls mode heuristic vector COST model |

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint warning Lint failed
rivoscibot/toolchain-ci-rivos-apply-patch fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Patch failed to apply

Commit Message

juzhe.zhong@rivai.ai Dec. 12, 2023, 2:25 p.m. UTC
  This patch apply vla vs. vls mode heuristic which can fixes the following FAILs:
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-not vset
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-times li\\s+[a-x0-9]+,0\\s+ret 2

The root cause of this FAIL is we failed to pick VLS mode for the vectorization.

Before this patch:

foo2:
        addi    sp,sp,-208
        addi    a2,sp,64
        addi    a5,sp,128
        lui     a6,%hi(.LANCHOR0)
        sd      ra,200(sp)
        addi    a6,a6,%lo(.LANCHOR0)
        mv      a0,a2
        mv      a1,a5
        li      a3,16
        mv      a4,sp
        vsetivli        zero,8,e64,m8,ta,ma
        vle64.v v8,0(a6)
        vse64.v v8,0(a2)
        vse64.v v8,0(a5)
.L4:
        vsetvli a5,a3,e32,m1,ta,ma
        slli    a2,a5,2
        vle32.v v2,0(a1)
        vle32.v v1,0(a0)
        sub     a3,a3,a5
        vadd.vv v1,v1,v2
        vse32.v v1,0(a4)
        add     a1,a1,a2
        add     a0,a0,a2
        add     a4,a4,a2
        bne     a3,zero,.L4
        lw      a4,128(sp)
        lw      a5,64(sp)
        addw    a5,a5,a4
        lw      a4,0(sp)
        bne     a4,a5,.L5
        lw      a4,132(sp)
        lw      a5,68(sp)
        addw    a5,a5,a4
        lw      a4,4(sp)
        bne     a4,a5,.L5
        lw      a4,136(sp)
        lw      a5,72(sp)
        addw    a5,a5,a4
        lw      a4,8(sp)
        bne     a4,a5,.L5
        lw      a4,140(sp)
        lw      a5,76(sp)
        addw    a5,a5,a4
        lw      a4,12(sp)
        bne     a4,a5,.L5
        lw      a4,144(sp)
        lw      a5,80(sp)
        addw    a5,a5,a4
        lw      a4,16(sp)
        bne     a4,a5,.L5
        lw      a4,148(sp)
        lw      a5,84(sp)
        addw    a5,a5,a4
        lw      a4,20(sp)
        bne     a4,a5,.L5
        lw      a4,152(sp)
        lw      a5,88(sp)
        addw    a5,a5,a4
        lw      a4,24(sp)
        bne     a4,a5,.L5
        lw      a4,156(sp)
        lw      a5,92(sp)
        addw    a5,a5,a4
        lw      a4,28(sp)
        bne     a4,a5,.L5
        lw      a4,160(sp)
        lw      a5,96(sp)
        addw    a5,a5,a4
        lw      a4,32(sp)
        bne     a4,a5,.L5
        lw      a4,164(sp)
        lw      a5,100(sp)
        addw    a5,a5,a4
        lw      a4,36(sp)
        bne     a4,a5,.L5
        lw      a4,168(sp)
        lw      a5,104(sp)
        addw    a5,a5,a4
        lw      a4,40(sp)
        bne     a4,a5,.L5
        lw      a4,172(sp)
        lw      a5,108(sp)
        addw    a5,a5,a4
        lw      a4,44(sp)
        bne     a4,a5,.L5
        lw      a4,176(sp)
        lw      a5,112(sp)
        addw    a5,a5,a4
        lw      a4,48(sp)
        bne     a4,a5,.L5
        lw      a4,180(sp)
        lw      a5,116(sp)
        addw    a5,a5,a4
        lw      a4,52(sp)
        bne     a4,a5,.L5
        lw      a4,184(sp)
        lw      a5,120(sp)
        addw    a5,a5,a4
        lw      a4,56(sp)
        bne     a4,a5,.L5
        lw      a4,188(sp)
        lw      a5,124(sp)
        addw    a5,a5,a4
        lw      a4,60(sp)
        bne     a4,a5,.L5
        ld      ra,200(sp)
        li      a0,0
        addi    sp,sp,208
        jr      ra
.L5:
        call    abort

After this patch:

        li      a0,0
        ret

The heuristic leverage ARM SVE and fully tested and confirm we have same behavior
as ARM SVE GCC and RVV Clang.

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (costs::analyze_loop_vinfo): New function.
	(costs::record_potential_vls_unrolling): Ditto.
	(costs::prefer_unrolled_loop): Ditto.
	(costs::better_main_loop_than_p): Ditto.
	(costs::add_stmt_cost): Ditto.
	* config/riscv/riscv-vector-costs.h (enum cost_type_enum): New enum.
	* config/riscv/t-riscv: Add new include files.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr111313.c: Adapt test.
	* gcc.target/riscv/rvv/autovec/vls/shift-3.c: Ditto.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 134 +++++++++++++++++-
 gcc/config/riscv/riscv-vector-costs.h         |  43 ++++++
 gcc/config/riscv/t-riscv                      |   2 +-
 .../vect/costmodel/riscv/rvv/vla_vs_vls-1.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-10.c  |  28 ++++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-11.c  |  28 ++++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-12.c  |  28 ++++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-2.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-3.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-4.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-5.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-6.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-7.c   |  13 ++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-8.c   |  27 ++++
 .../vect/costmodel/riscv/rvv/vla_vs_vls-9.c   |  27 ++++
 .../gcc.target/riscv/rvv/autovec/pr111313.c   |   2 +-
 .../riscv/rvv/autovec/vls/shift-3.c           |   2 +-
 17 files changed, 408 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c
  

Comments

Robin Dapp Dec. 12, 2023, 9:43 p.m. UTC | #1
Given that it's almost verbatim aarch64's implementation and the
general approach appears sensible, LGTM.

Regards
 Robin
  

Patch

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 3fcb5f3176f..7888cef58fe 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -41,6 +41,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ssa.h"
 #include "backend.h"
 #include "tree-data-ref.h"
+#include "tree-ssa-loop-niter.h"
 
 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -601,7 +602,101 @@  preferred_new_lmul_p (loop_vec_info other_loop_vinfo)
 
 costs::costs (vec_info *vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar)
-{}
+{
+  if (costing_for_scalar)
+    m_cost_type = SCALAR_COST;
+  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
+    m_cost_type = VLA_VECTOR_COST;
+  else
+    m_cost_type = VLS_VECTOR_COST;
+}
+
+/* Do one-time initialization of the costs given that we're
+   costing the loop vectorization described by LOOP_VINFO.  */
+void
+costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
+{
+  /* Record the number of times that the vector loop would execute,
+     if known.  */
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  auto scalar_niters = max_stmt_executions_int (loop);
+  if (scalar_niters >= 0)
+    {
+      unsigned int vf = vect_vf_for_cost (loop_vinfo);
+      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+	m_num_vector_iterations = scalar_niters / vf;
+      else
+	m_num_vector_iterations = CEIL (scalar_niters, vf);
+    }
+
+  /* Detect whether we're vectorizing for VLA and should apply the unrolling
+     heuristic described above m_unrolled_vls_niters.  */
+  record_potential_vls_unrolling (loop_vinfo);
+}
+
+/* Decide whether to use the unrolling heuristic described above
+   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
+   describes the loop that we're vectorizing.  */
+void
+costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
+{
+  /* We only want to apply the heuristic if LOOP_VINFO is being
+     vectorized for VLA.  */
+  if (m_cost_type != VLA_VECTOR_COST)
+    return;
+
+  /* We don't want to apply the heuristic to outer loops, since it's
+     harder to track two levels of unrolling.  */
+  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
+    return;
+
+  /* Only handle cases in which the number of VLS iterations
+     would be known at compile time but the number of SVE iterations
+     would not.  */
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      || BYTES_PER_RISCV_VECTOR.is_constant ())
+    return;
+
+  /* Guess how many times the VLS loop would iterate and make
+     sure that it is within the complete unrolling limit.  Even if the
+     number of iterations is small enough, the number of statements might
+     not be, which is why we need to estimate the number of statements too.  */
+  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
+  unsigned HOST_WIDE_INT unrolled_vls_niters
+    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
+  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
+    return;
+
+  /* Record that we're applying the heuristic and should try to estimate
+     the number of statements in the VLS loop.  */
+  m_unrolled_vls_niters = unrolled_vls_niters;
+}
+
+/* Return true if (a) we're applying the VLS vs. VLA unrolling
+   heuristic described above m_unrolled_vls_niters and (b) the heuristic
+   says that we should prefer the VLS loop.  */
+bool
+costs::prefer_unrolled_loop () const
+{
+  if (!m_unrolled_vls_stmts)
+    return false;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "Number of insns in"
+		     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n",
+		     m_unrolled_vls_stmts);
+
+  /* The balance here is tricky.  On the one hand, we can't be sure whether
+     the code is vectorizable with VLS or not.  However, even if
+     it isn't vectorizable with VLS, there's a possibility that
+     the scalar code could also be unrolled.  Some of the code might then
+     benefit from SLP, or from using LDP and STP.  We therefore apply
+     the heuristic regardless of can_use_vls_p.  */
+  return (m_unrolled_vls_stmts
+	  && (m_unrolled_vls_stmts
+	      <= (unsigned int) param_max_completely_peeled_insns));
+}
 
 bool
 costs::better_main_loop_than_p (const vector_costs *uncast_other) const
@@ -618,6 +713,21 @@  costs::better_main_loop_than_p (const vector_costs *uncast_other) const
 		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
 		     vect_vf_for_cost (other_loop_vinfo));
 
+  /* Apply the unrolling heuristic described above m_unrolled_vls_niters.  */
+  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
+    {
+      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
+      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
+      if (this_prefer_unrolled != other_prefer_unrolled)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Preferring VLS loop because"
+			     " it can be unrolled\n");
+	  return other_prefer_unrolled;
+	}
+    }
+
   if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
       && riscv_autovec_lmul == RVV_DYNAMIC)
     {
@@ -643,6 +753,28 @@  costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   /* TODO: Use default STMT cost model.
 	   We will support more accurate STMT cost model later.  */
   int stmt_cost = default_builtin_vectorization_cost (kind, vectype, misalign);
+
+  /* Do one-time initialization based on the vinfo.  */
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+  if (!m_analyzed_vinfo)
+    {
+      if (loop_vinfo)
+	analyze_loop_vinfo (loop_vinfo);
+
+      m_analyzed_vinfo = true;
+    }
+
+  if (stmt_info)
+    {
+      /* If we're applying the VLA vs. VLS unrolling heuristic,
+	 estimate the number of statements in the unrolled VLS
+	 loop.  For simplicitly, we assume that one iteration of the
+	 VLS loop would need the same number of statements
+	 as one iteration of the VLA loop.  */
+      if (where == vect_body && m_unrolled_vls_niters)
+	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
+    }
+
   return record_stmt_cost (stmt_info, where, count * stmt_cost);
 }
 
diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h
index e18775e230b..ff294a60aaf 100644
--- a/gcc/config/riscv/riscv-vector-costs.h
+++ b/gcc/config/riscv/riscv-vector-costs.h
@@ -30,6 +30,13 @@  struct stmt_point
   gimple *stmt;
 };
 
+enum cost_type_enum
+{
+  SCALAR_COST,
+  VLA_VECTOR_COST,
+  VLS_VECTOR_COST
+};
+
 /* Pair typedef used by live range: <start, end>.  */
 typedef std::pair<unsigned int, unsigned int> pair;
 
@@ -49,6 +56,42 @@  private:
 			      tree vectype, int misalign,
 			      vect_cost_model_location where) override;
   void finish_cost (const vector_costs *) override;
+
+  /* True if we have performed one-time initialization based on the
+     vec_info.  */
+  bool m_analyzed_vinfo = false;
+
+  /* - If M_COST_TYPE = SCALAR_COST then we're costing the original scalar code.
+     - If M_COST_TYPE = VLA_VECTOR_COST is nonzero then we're costing VLA
+       partial vectorization codes.
+     - If M_COST_TYPE = VLS_VECTOR_COST is nonzero then we're costing VLS
+       minimum length vector codes.  */
+  enum cost_type_enum m_cost_type;
+
+  /* On some CPUs, VLA and VLS provide the same theoretical vector
+     throughput, such as 4x128 VLS vs. 2x256 VLA.  In those
+     situations, we try to predict whether an VLS implementation
+     of the loop could be completely unrolled and become straight-line code.
+     If so, it is generally better to use the VLS version rather
+     than length-agnostic VLA, since the VLA loop would execute an unknown
+     number of times and so could not be completely unrolled in the same way.
+
+     If we're applying this heuristic, M_UNROLLED_VLS_NITERS is the
+     number of VLS loop iterations that would be unrolled and
+     M_UNROLLED_VLS_STMTS estimates the total number of statements
+     in the unrolled loop.  Both values are zero if we're not applying
+     the heuristic.  */
+  unsigned HOST_WIDE_INT m_unrolled_vls_niters = 0;
+  unsigned HOST_WIDE_INT m_unrolled_vls_stmts = 0;
+
+  /* If we're vectorizing a loop that executes a constant number of times,
+     this variable gives the number of times that the vector loop would
+     iterate, otherwise it is zero.  */
+  uint64_t m_num_vector_iterations = 0;
+
+  void analyze_loop_vinfo (loop_vec_info);
+  void record_potential_vls_unrolling (loop_vec_info);
+  bool prefer_unrolled_loop () const;
 };
 
 } // namespace riscv_vector
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 1aac8b58bb5..2b2ec409525 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -74,7 +74,7 @@  riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H) $(FUNCTION_H) \
   $(TREE_H) basic-block.h $(RTL_H) gimple.h targhooks.h cfgloop.h \
   fold-const.h $(TM_P_H) tree-vectorizer.h gimple-iterator.h bitmap.h \
-  ssa.h backend.h \
+  ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h \
   $(srcdir)/config/riscv/riscv-vector-costs.h
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/riscv/riscv-vector-costs.cc
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c
new file mode 100644
index 00000000000..1ef4215e72c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 16; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c
new file mode 100644
index 00000000000..3ddffa37fe4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint-gcc.h>
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < N; i++) {
+    sum = 0;
+    for (j = 0; j < N; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c
new file mode 100644
index 00000000000..7625ec5c4b1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint-gcc.h>
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < N; i++) {
+    sum = 0;
+    for (j = 0; j < N; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c
new file mode 100644
index 00000000000..7625ec5c4b1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint-gcc.h>
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < N; i++) {
+    sum = 0;
+    for (j = 0; j < N; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c
new file mode 100644
index 00000000000..ca203f50847
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 16; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c
new file mode 100644
index 00000000000..f8e53350785
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 16; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c
new file mode 100644
index 00000000000..4859d570c0c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 16; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c
new file mode 100644
index 00000000000..8a568028bcf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 16; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c
new file mode 100644
index 00000000000..46ebd5fd49b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 32; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetivli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c
new file mode 100644
index 00000000000..f5aceca32d7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */
+
+void
+foo (int *__restrict a, int *__restrict b, int *__restrict c)
+{
+  for (int i = 0; i < 32; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetivli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c
new file mode 100644
index 00000000000..ea6a7cbe2b1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include <stdint-gcc.h>
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < N; i++) {
+    sum = 0;
+    for (j = 0; j < N; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c
new file mode 100644
index 00000000000..7f03cb9ecbe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */
+
+#include <stdint-gcc.h>
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (){
+  int i,j;
+  int sum,x;
+
+  for (i = 0; i < N; i++) {
+    sum = 0;
+    for (j = 0; j < N; j++) {
+      sum += (i + j);
+    }
+    a[i] = sum;
+  }
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c
index 1e01cfefd47..a4f8c37f95d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2 -fno-vect-cost-model" } */
 
 #define K 32
 short in[2*K][K];
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
index e25e7b59c3e..8de1b9c0c41 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
@@ -53,5 +53,5 @@  DEF_OP_VV (shift, 128, int64_t, <<)
 DEF_OP_VV (shift, 256, int64_t, <<)
 DEF_OP_VV (shift, 512, int64_t, <<)
 
-/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 43 } } */
+/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 46 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */