RISC-V: VLA preempts VLS on unknown NITERS loop

Message ID 20240111024223.264227-1-juzhe.zhong@rivai.ai
State Committed
Commit 1a51886a79c97e53ba945c1926ab215ed6a9f5ed
Headers
Series RISC-V: VLA preempts VLS on unknown NITERS loop |

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint success Lint passed
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gc-lp64d-multilib success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
rivoscibot/toolchain-ci-rivos-build--linux-rv32gc_zba_zbb_zbc_zbs-ilp32d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-non-multilib success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 warning Patch is already merged
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-test success Testing passed

Commit Message

juzhe.zhong@rivai.ai Jan. 11, 2024, 2:42 a.m. UTC
  This patch fixes the known issues on SLP cases.  Before this patch, the generated code is:

	ble	a2,zero,.L11
	addiw	t1,a2,-1
	li	a5,15
	bleu	t1,a5,.L9
	srliw	a7,t1,4
	slli	a7,a7,7
	lui	t3,%hi(.LANCHOR0)
	lui	a6,%hi(.LANCHOR0+128)
	addi	t3,t3,%lo(.LANCHOR0)
	li	a4,128
	addi	a6,a6,%lo(.LANCHOR0+128)
	add	a7,a7,a0
	addi	a3,a1,37
	mv	a5,a0
	vsetvli	zero,a4,e8,m8,ta,ma
	vle8.v	v24,0(t3)
	vle8.v	v16,0(a6)
.L4:
	li	a6,128
	vle8.v	v0,0(a3)
	vrgather.vv	v8,v0,v24
	vadd.vv	v8,v8,v16
	vse8.v	v8,0(a5)
	add	a5,a5,a6
	add	a3,a3,a6
	bne	a5,a7,.L4
	andi	a5,t1,-16
	mv	t1,a5
.L3:
	subw	a2,a2,a5
	li	a4,1
	beq	a2,a4,.L5
	slli	a5,a5,32
	srli	a5,a5,32
	addiw	a2,a2,-1
	slli	a5,a5,3
	csrr	a4,vlenb
	slli	a6,a2,32
	addi	t3,a5,37
	srli	a3,a6,29
	slli	a4,a4,2
	add	t3,a1,t3
	add	a5,a0,a5
	mv	t5,a3
	bgtu	a3,a4,.L14
.L6:
	li	a4,50790400
	addi	a4,a4,1541
	li	a6,67633152
	addi	a6,a6,513
	slli	a4,a4,32
	add	a4,a4,a6
	vsetvli	t4,zero,e64,m4,ta,ma
	vmv.v.x	v16,a4
	vsetvli	a6,zero,e16,m8,ta,ma
	vid.v	v8
	vsetvli	zero,t5,e8,m4,ta,ma
	vle8.v	v20,0(t3)
	vsetvli	a6,zero,e16,m8,ta,ma
	csrr	a7,vlenb
	vand.vi	v8,v8,-8
	vsetvli	zero,zero,e8,m4,ta,ma
	slli	a4,a7,2
	vrgatherei16.vv	v4,v20,v8
	vadd.vv	v4,v4,v16
	vsetvli	zero,t5,e8,m4,ta,ma
	vse8.v	v4,0(a5)
	bgtu	a3,a4,.L15
.L7:
	addw	t1,a2,t1
.L5:
	slliw	a5,t1,3
	add	a1,a1,a5
	lui	a4,%hi(.LC2)
	add	a0,a0,a5
	lbu	a3,37(a1)
	addi	a5,a4,%lo(.LC2)
	vsetivli	zero,8,e8,mf2,ta,ma
	vmv.v.x	v1,a3
	vle8.v	v2,0(a5)
	vadd.vv	v1,v1,v2
	vse8.v	v1,0(a0)
.L11:
	ret
.L15:
	sub	a3,a3,a4
	bleu	a3,a4,.L8
	mv	a3,a4
.L8:
	li	a7,50790400
	csrr	a4,vlenb
	slli	a4,a4,2
	addi	a7,a7,1541
	li	t4,67633152
	add	t3,t3,a4
	vsetvli	zero,a3,e8,m4,ta,ma
	slli	a7,a7,32
	addi	t4,t4,513
	vle8.v	v20,0(t3)
	add	a4,a5,a4
	add	a7,a7,t4
	vsetvli	a5,zero,e64,m4,ta,ma
	vmv.v.x	v16,a7
	vsetvli	a6,zero,e16,m8,ta,ma
	vid.v	v8
	vand.vi	v8,v8,-8
	vsetvli	zero,zero,e8,m4,ta,ma
	vrgatherei16.vv	v4,v20,v8
	vadd.vv	v4,v4,v16
	vsetvli	zero,a3,e8,m4,ta,ma
	vse8.v	v4,0(a4)
	j	.L7
.L14:
	mv	t5,a4
	j	.L6
.L9:
	li	a5,0
	li	t1,0
	j	.L3

The vectorization codegen is quite inefficient since we choose a VLS mode to vectorize the loop body,
while the epilogue chooses a VLA mode.

cost.c:6:21: note:  ***** Choosing vector mode V128QI
cost.c:6:21: note:  ***** Choosing epilogue vector mode RVVM4QI

As we know, on the RVV side we have VLA modes and VLS modes. VLA modes support partial vectors whereas
VLS modes support only full vectors.  The goal of adding VLS modes is to improve the codegen of loops with
known NITERS or of SLP code.

If NITERS is unknown (that is, the loop condition is i < n and n is unknown), we will always have partial-vector
vectorization, either in the loop body or in the epilogue. In this case, it's always more efficient to apply VLA
partial vectorization to the loop body, which then does not need an epilogue.

After this patch:

f:
	ble	a2,zero,.L7
	li	a5,1
	beq	a2,a5,.L5
	li	a6,50790400
	addi	a6,a6,1541
	li	a4,67633152
	addi	a4,a4,513
	csrr	a5,vlenb
	addiw	a2,a2,-1
	slli	a6,a6,32
	add	a6,a6,a4
	slli	a5,a5,2
	slli	a4,a2,32
	vsetvli	t1,zero,e64,m4,ta,ma
	srli	a3,a4,29
	neg	t4,a5
	addi	a7,a1,37
	mv	a4,a0
	vmv.v.x	v12,a6
	vsetvli	t3,zero,e16,m8,ta,ma
	vid.v	v16
	vand.vi	v16,v16,-8
.L4:
	minu	a6,a3,a5
	vsetvli	zero,a6,e8,m4,ta,ma
	vle8.v	v8,0(a7)
	vsetvli	t3,zero,e8,m4,ta,ma
	mv	t1,a3
	vrgatherei16.vv	v4,v8,v16
	vsetvli	zero,a6,e8,m4,ta,ma
	vadd.vv	v4,v4,v12
	vse8.v	v4,0(a4)
	add	a7,a7,a5
	add	a4,a4,a5
	add	a3,a3,t4
	bgtu	t1,a5,.L4
.L3:
	slliw	a2,a2,3
	add	a1,a1,a2
	lui	a5,%hi(.LC0)
	lbu	a4,37(a1)
	add	a0,a0,a2
	addi	a5,a5,%lo(.LC0)
	vsetivli	zero,8,e8,mf2,ta,ma
	vmv.v.x	v1,a4
	vle8.v	v2,0(a5)
	vadd.vv	v1,v1,v2
	vse8.v	v1,0(a0)
.L7:
	ret

Tested on both RV32 and RV64 with no regressions. OK for trunk?

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): VLA preempt VLS on unknown NITERS loop.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/partial/slp-1.c: Remove xfail.
	* gcc.target/riscv/rvv/autovec/partial/slp-16.c: Ditto.
	* gcc.target/riscv/rvv/autovec/partial/slp-3.c: Ditto.
	* gcc.target/riscv/rvv/autovec/partial/slp-5.c: Ditto.

---
 gcc/config/riscv/riscv-vector-costs.cc                   | 9 +++++++++
 .../gcc.target/riscv/rvv/autovec/partial/slp-1.c         | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-16.c        | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-3.c         | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-5.c         | 2 +-
 5 files changed, 13 insertions(+), 4 deletions(-)
  

Comments

Kito Cheng Jan. 11, 2024, 5:56 a.m. UTC | #1
The idea makes sense to me, LGTM :)

On Thu, Jan 11, 2024 at 10:43 AM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch fixes the known issues on SLP cases:
>
>         ble     a2,zero,.L11
>         addiw   t1,a2,-1
>         li      a5,15
>         bleu    t1,a5,.L9
>         srliw   a7,t1,4
>         slli    a7,a7,7
>         lui     t3,%hi(.LANCHOR0)
>         lui     a6,%hi(.LANCHOR0+128)
>         addi    t3,t3,%lo(.LANCHOR0)
>         li      a4,128
>         addi    a6,a6,%lo(.LANCHOR0+128)
>         add     a7,a7,a0
>         addi    a3,a1,37
>         mv      a5,a0
>         vsetvli zero,a4,e8,m8,ta,ma
>         vle8.v  v24,0(t3)
>         vle8.v  v16,0(a6)
> .L4:
>         li      a6,128
>         vle8.v  v0,0(a3)
>         vrgather.vv     v8,v0,v24
>         vadd.vv v8,v8,v16
>         vse8.v  v8,0(a5)
>         add     a5,a5,a6
>         add     a3,a3,a6
>         bne     a5,a7,.L4
>         andi    a5,t1,-16
>         mv      t1,a5
> .L3:
>         subw    a2,a2,a5
>         li      a4,1
>         beq     a2,a4,.L5
>         slli    a5,a5,32
>         srli    a5,a5,32
>         addiw   a2,a2,-1
>         slli    a5,a5,3
>         csrr    a4,vlenb
>         slli    a6,a2,32
>         addi    t3,a5,37
>         srli    a3,a6,29
>         slli    a4,a4,2
>         add     t3,a1,t3
>         add     a5,a0,a5
>         mv      t5,a3
>         bgtu    a3,a4,.L14
> .L6:
>         li      a4,50790400
>         addi    a4,a4,1541
>         li      a6,67633152
>         addi    a6,a6,513
>         slli    a4,a4,32
>         add     a4,a4,a6
>         vsetvli t4,zero,e64,m4,ta,ma
>         vmv.v.x v16,a4
>         vsetvli a6,zero,e16,m8,ta,ma
>         vid.v   v8
>         vsetvli zero,t5,e8,m4,ta,ma
>         vle8.v  v20,0(t3)
>         vsetvli a6,zero,e16,m8,ta,ma
>         csrr    a7,vlenb
>         vand.vi v8,v8,-8
>         vsetvli zero,zero,e8,m4,ta,ma
>         slli    a4,a7,2
>         vrgatherei16.vv v4,v20,v8
>         vadd.vv v4,v4,v16
>         vsetvli zero,t5,e8,m4,ta,ma
>         vse8.v  v4,0(a5)
>         bgtu    a3,a4,.L15
> .L7:
>         addw    t1,a2,t1
> .L5:
>         slliw   a5,t1,3
>         add     a1,a1,a5
>         lui     a4,%hi(.LC2)
>         add     a0,a0,a5
>         lbu     a3,37(a1)
>         addi    a5,a4,%lo(.LC2)
>         vsetivli        zero,8,e8,mf2,ta,ma
>         vmv.v.x v1,a3
>         vle8.v  v2,0(a5)
>         vadd.vv v1,v1,v2
>         vse8.v  v1,0(a0)
> .L11:
>         ret
> .L15:
>         sub     a3,a3,a4
>         bleu    a3,a4,.L8
>         mv      a3,a4
> .L8:
>         li      a7,50790400
>         csrr    a4,vlenb
>         slli    a4,a4,2
>         addi    a7,a7,1541
>         li      t4,67633152
>         add     t3,t3,a4
>         vsetvli zero,a3,e8,m4,ta,ma
>         slli    a7,a7,32
>         addi    t4,t4,513
>         vle8.v  v20,0(t3)
>         add     a4,a5,a4
>         add     a7,a7,t4
>         vsetvli a5,zero,e64,m4,ta,ma
>         vmv.v.x v16,a7
>         vsetvli a6,zero,e16,m8,ta,ma
>         vid.v   v8
>         vand.vi v8,v8,-8
>         vsetvli zero,zero,e8,m4,ta,ma
>         vrgatherei16.vv v4,v20,v8
>         vadd.vv v4,v4,v16
>         vsetvli zero,a3,e8,m4,ta,ma
>         vse8.v  v4,0(a4)
>         j       .L7
> .L14:
>         mv      t5,a4
>         j       .L6
> .L9:
>         li      a5,0
>         li      t1,0
>         j       .L3
>
> The vectorization codegen is quite inefficient since we choose a VLS modes to vectorize the loop body
> with epilogue choosing a VLA modes.
>
> cost.c:6:21: note:  ***** Choosing vector mode V128QI
> cost.c:6:21: note:  ***** Choosing epilogue vector mode RVVM4QI
>
> As we known, in RVV side, we have VLA modes and VLS modes. VLAmodes support partial vectors wheras
> VLSmodes support full vectors.  The goal we add VLSmodes is to improve the codegen of known NITERS
> or SLP codes.
>
> If NITERS is unknown, that is i < n, n is unknown. We will always have partial vectors vectorization.
> It can be loop body or epilogue. In this case, It's always more efficient to apply VLA partial vectorization
> on loop body which doesn't have epilogue.
>
> After this patch:
>
> f:
>         ble     a2,zero,.L7
>         li      a5,1
>         beq     a2,a5,.L5
>         li      a6,50790400
>         addi    a6,a6,1541
>         li      a4,67633152
>         addi    a4,a4,513
>         csrr    a5,vlenb
>         addiw   a2,a2,-1
>         slli    a6,a6,32
>         add     a6,a6,a4
>         slli    a5,a5,2
>         slli    a4,a2,32
>         vsetvli t1,zero,e64,m4,ta,ma
>         srli    a3,a4,29
>         neg     t4,a5
>         addi    a7,a1,37
>         mv      a4,a0
>         vmv.v.x v12,a6
>         vsetvli t3,zero,e16,m8,ta,ma
>         vid.v   v16
>         vand.vi v16,v16,-8
> .L4:
>         minu    a6,a3,a5
>         vsetvli zero,a6,e8,m4,ta,ma
>         vle8.v  v8,0(a7)
>         vsetvli t3,zero,e8,m4,ta,ma
>         mv      t1,a3
>         vrgatherei16.vv v4,v8,v16
>         vsetvli zero,a6,e8,m4,ta,ma
>         vadd.vv v4,v4,v12
>         vse8.v  v4,0(a4)
>         add     a7,a7,a5
>         add     a4,a4,a5
>         add     a3,a3,t4
>         bgtu    t1,a5,.L4
> .L3:
>         slliw   a2,a2,3
>         add     a1,a1,a2
>         lui     a5,%hi(.LC0)
>         lbu     a4,37(a1)
>         add     a0,a0,a2
>         addi    a5,a5,%lo(.LC0)
>         vsetivli        zero,8,e8,mf2,ta,ma
>         vmv.v.x v1,a4
>         vle8.v  v2,0(a5)
>         vadd.vv v1,v1,v2
>         vse8.v  v1,0(a0)
> .L7:
>         ret
>
> Tested on both RV32 and RV64 no regression. Ok for trunk ?
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): VLA preempt VLS on unknown NITERS loop.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/partial/slp-1.c: Remove xfail.
>         * gcc.target/riscv/rvv/autovec/partial/slp-16.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/partial/slp-3.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/partial/slp-5.c: Ditto.
>
> ---
>  gcc/config/riscv/riscv-vector-costs.cc                   | 9 +++++++++
>  .../gcc.target/riscv/rvv/autovec/partial/slp-1.c         | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-16.c        | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-3.c         | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-5.c         | 2 +-
>  5 files changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index e53f4a186f3..58ec0b9b503 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -1042,6 +1042,15 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
>             }
>         }
>      }
> +  /* If NITERS is unknown, we should not use VLS modes to vectorize
> +     the loop since we don't support partial vectors for VLS modes,
> +     that is, we will have full vectors (VLSmodes) on loop body
> +     and partial vectors (VLAmodes) on loop epilogue which is very
> +     inefficient.  Instead, we should apply partial vectors (VLAmodes)
> +     on loop body without an epilogue on unknown NITERS loop.  */
> +  else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
> +          && m_cost_type == VLS_VECTOR_COST)
> +    return false;
>
>    return vector_costs::better_main_loop_than_p (other);
>  }
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> index 948b20b68d3..0a1d1f72e6b 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> @@ -21,6 +21,6 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvand} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> index 7b23cafab3f..05220c32c5d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> @@ -21,6 +21,6 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1"} } } } */
>  /* { dg-final { scan-assembler-not {\tvmul} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> index 3622c59c439..5e64231b37d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> index 5c0a6775474..c78b3709078 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
> --
> 2.36.3
>
  

Patch

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index e53f4a186f3..58ec0b9b503 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1042,6 +1042,15 @@  costs::better_main_loop_than_p (const vector_costs *uncast_other) const
 	    }
 	}
     }
+  /* If NITERS is unknown, we should not use VLS modes to vectorize
+     the loop since we don't support partial vectors for VLS modes,
+     that is, we will have full vectors (VLSmodes) on loop body
+     and partial vectors (VLAmodes) on loop epilogue which is very
+     inefficient.  Instead, we should apply partial vectors (VLAmodes)
+     on loop body without an epilogue on unknown NITERS loop.  */
+  else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
+	   && m_cost_type == VLS_VECTOR_COST)
+    return false;
 
   return vector_costs::better_main_loop_than_p (other);
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
index 948b20b68d3..0a1d1f72e6b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
@@ -21,6 +21,6 @@  f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvand} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
index 7b23cafab3f..05220c32c5d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
@@ -21,6 +21,6 @@  f (uint8_t *restrict a, uint8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1"} } } } */
 /* { dg-final { scan-assembler-not {\tvmul} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
index 3622c59c439..5e64231b37d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
@@ -21,4 +21,4 @@  f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
index 5c0a6775474..c78b3709078 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
@@ -21,4 +21,4 @@  f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */