From patchwork Thu Jan 11 02:42:23 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai>
X-Patchwork-Id: 83811
Return-Path: <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 60CDB385801C
	for <patchwork@sourceware.org>; Thu, 11 Jan 2024 02:43:05 +0000 (GMT)
X-Original-To: gcc-patches@gcc.gnu.org
Delivered-To: gcc-patches@gcc.gnu.org
Received: from smtpbgsg2.qq.com (smtpbgsg2.qq.com [54.254.200.128])
 by sourceware.org (Postfix) with ESMTPS id B76343858D38
 for <gcc-patches@gcc.gnu.org>; Thu, 11 Jan 2024 02:42:31 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org B76343858D38
Authentication-Results: sourceware.org;
 dmarc=none (p=none dis=none) header.from=rivai.ai
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=rivai.ai
ARC-Filter: OpenARC Filter v1.0.0 sourceware.org B76343858D38
Authentication-Results: server2.sourceware.org;
 arc=none smtp.remote-ip=54.254.200.128
ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1704940955; cv=none;
 b=VOL+jzj+g9+nxTyLMDtWgHReL7tv+rVZAa2Luwt/imf5dnyIh6ggK44pfC9LaJAlRzhu+sW6MxZS8Y/D+D2iiwPWvYtfwjO5kTHIhsdvcybvK5huybNWyJZbSJlNq/wlZtEunWJnBowJQMMhtFN3txhSvujhKljHSkd+eKtkgEA=
ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key;
 t=1704940955; c=relaxed/simple;
 bh=xcofrw5ljfwC41G8pnTEOYinYtsN9HGvkVLyfCxEj9g=;
 h=From:To:Subject:Date:Message-Id:MIME-Version;
 b=XLnW62SLvWXoha+V7DLU0teHPTmny8UIcDQETW20zjzwatpjqWx37TcB0nryYfX1kyk5KVWRNvDikXtztPu6pQXqXqaDTrji17pvLXirLjS8QF+DxAt9ETC1hurHJFrSHru4XSZj36T1jeVKtF6HIrGuONrD/ALVE+iac6gKs3k=
ARC-Authentication-Results: i=1; server2.sourceware.org
X-QQ-mid: bizesmtp69t1704940945tv5oecjv
Received: from rios-cad122.hadoop.rioslab.org ( [58.60.1.26])
 by bizesmtp.qq.com (ESMTP) with
 id ; Thu, 11 Jan 2024 10:42:24 +0800 (CST)
X-QQ-SSF: 01400000000000G0V000000A0000000
X-QQ-FEAT: znfcQSa1hKbVlVnVaOsPFwYtblmNo1el2zuI2W8YXSpds42Iv4JNC6L3vW106
 HV5muEIg5A3B5pn5Ize3Y5fWqY3+k/VIG1usPh3p6i28xR1L5juFn0Sa2HX6vNJWFI5NXEi
 AgZhBWv87sNDPbjALSxyRcKjEGHp1jlW9BVCF8xOzNQtXpeUMtKkJuytH+cB6/yGni+xAxt
 IKmK5lWa+h+ChOIDtHXPa3ET3Gj2DJBbBs7rw3bmxM1bqqkZskcb5SClZjhJEoRPDmkiT37
 zmOdAejK+nWuCH5dDq2W+DgldyEDD7XEE+d0P3q06YQmNK1zs7KeNsddA6ksaJRgtOIdSLH
 hhXpH3fzpJ5JZeVvYmedXoMMKiu6XGrjRAjGD488F5GvlAR6GeVfiLJT0Cumg==
X-QQ-GoodBg: 2
X-BIZMAIL-ID: 5198565137964072905
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
To: gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com, kito.cheng@sifive.com, jeffreyalaw@gmail.com,
 rdapp.gcc@gmail.com, Juzhe-Zhong <juzhe.zhong@rivai.ai>
Subject: [PATCH] RISC-V: VLA preempts VLS on unknown NITERS loop
Date: Thu, 11 Jan 2024 10:42:23 +0800
Message-Id: <20240111024223.264227-1-juzhe.zhong@rivai.ai>
X-Mailer: git-send-email 2.36.3
MIME-Version: 1.0
X-QQ-SENDSIZE: 520
Feedback-ID: bizesmtp:rivai.ai:qybglogicsvrgz:qybglogicsvrgz7a-one-0
X-Spam-Status: No, score=-9.8 required=5.0 tests=BAYES_00, GIT_PATCH_0,
 KAM_DMARC_STATUS, KAM_SHORT, RCVD_IN_BARRACUDACENTRAL, RCVD_IN_DNSWL_NONE,
 RCVD_IN_MSPIKE_H3, RCVD_IN_MSPIKE_WL, SCC_5_SHORT_WORD_LINES, SPF_HELO_PASS,
 SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.30
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org

This patch fixes a known issue in the SLP cases.  Before this patch, the codegen is:

	ble	a2,zero,.L11
	addiw	t1,a2,-1
	li	a5,15
	bleu	t1,a5,.L9
	srliw	a7,t1,4
	slli	a7,a7,7
	lui	t3,%hi(.LANCHOR0)
	lui	a6,%hi(.LANCHOR0+128)
	addi	t3,t3,%lo(.LANCHOR0)
	li	a4,128
	addi	a6,a6,%lo(.LANCHOR0+128)
	add	a7,a7,a0
	addi	a3,a1,37
	mv	a5,a0
	vsetvli	zero,a4,e8,m8,ta,ma
	vle8.v	v24,0(t3)
	vle8.v	v16,0(a6)
.L4:
	li	a6,128
	vle8.v	v0,0(a3)
	vrgather.vv	v8,v0,v24
	vadd.vv	v8,v8,v16
	vse8.v	v8,0(a5)
	add	a5,a5,a6
	add	a3,a3,a6
	bne	a5,a7,.L4
	andi	a5,t1,-16
	mv	t1,a5
.L3:
	subw	a2,a2,a5
	li	a4,1
	beq	a2,a4,.L5
	slli	a5,a5,32
	srli	a5,a5,32
	addiw	a2,a2,-1
	slli	a5,a5,3
	csrr	a4,vlenb
	slli	a6,a2,32
	addi	t3,a5,37
	srli	a3,a6,29
	slli	a4,a4,2
	add	t3,a1,t3
	add	a5,a0,a5
	mv	t5,a3
	bgtu	a3,a4,.L14
.L6:
	li	a4,50790400
	addi	a4,a4,1541
	li	a6,67633152
	addi	a6,a6,513
	slli	a4,a4,32
	add	a4,a4,a6
	vsetvli	t4,zero,e64,m4,ta,ma
	vmv.v.x	v16,a4
	vsetvli	a6,zero,e16,m8,ta,ma
	vid.v	v8
	vsetvli	zero,t5,e8,m4,ta,ma
	vle8.v	v20,0(t3)
	vsetvli	a6,zero,e16,m8,ta,ma
	csrr	a7,vlenb
	vand.vi	v8,v8,-8
	vsetvli	zero,zero,e8,m4,ta,ma
	slli	a4,a7,2
	vrgatherei16.vv	v4,v20,v8
	vadd.vv	v4,v4,v16
	vsetvli	zero,t5,e8,m4,ta,ma
	vse8.v	v4,0(a5)
	bgtu	a3,a4,.L15
.L7:
	addw	t1,a2,t1
.L5:
	slliw	a5,t1,3
	add	a1,a1,a5
	lui	a4,%hi(.LC2)
	add	a0,a0,a5
	lbu	a3,37(a1)
	addi	a5,a4,%lo(.LC2)
	vsetivli	zero,8,e8,mf2,ta,ma
	vmv.v.x	v1,a3
	vle8.v	v2,0(a5)
	vadd.vv	v1,v1,v2
	vse8.v	v1,0(a0)
.L11:
	ret
.L15:
	sub	a3,a3,a4
	bleu	a3,a4,.L8
	mv	a3,a4
.L8:
	li	a7,50790400
	csrr	a4,vlenb
	slli	a4,a4,2
	addi	a7,a7,1541
	li	t4,67633152
	add	t3,t3,a4
	vsetvli	zero,a3,e8,m4,ta,ma
	slli	a7,a7,32
	addi	t4,t4,513
	vle8.v	v20,0(t3)
	add	a4,a5,a4
	add	a7,a7,t4
	vsetvli	a5,zero,e64,m4,ta,ma
	vmv.v.x	v16,a7
	vsetvli	a6,zero,e16,m8,ta,ma
	vid.v	v8
	vand.vi	v8,v8,-8
	vsetvli	zero,zero,e8,m4,ta,ma
	vrgatherei16.vv	v4,v20,v8
	vadd.vv	v4,v4,v16
	vsetvli	zero,a3,e8,m4,ta,ma
	vse8.v	v4,0(a4)
	j	.L7
.L14:
	mv	t5,a4
	j	.L6
.L9:
	li	a5,0
	li	t1,0
	j	.L3

The vectorized codegen is quite inefficient because we choose a VLS mode to vectorize the loop body
and a VLA mode to vectorize the epilogue:

cost.c:6:21: note:  ***** Choosing vector mode V128QI
cost.c:6:21: note:  ***** Choosing epilogue vector mode RVVM4QI

As we know, on the RVV side we have both VLA modes and VLS modes.  VLA modes support partial vectors, whereas
VLS modes support only full vectors.  The goal of adding VLS modes is to improve the codegen for loops with known NITERS
and for SLP code.

If NITERS is unknown (that is, the loop condition is i < n and n is unknown), we will always need partial-vector vectorization,
either in the loop body or in the epilogue.  In this case, it is always more efficient to apply VLA partial-vector vectorization
to the loop body, which then does not need an epilogue.

After this patch:

f:
	ble	a2,zero,.L7
	li	a5,1
	beq	a2,a5,.L5
	li	a6,50790400
	addi	a6,a6,1541
	li	a4,67633152
	addi	a4,a4,513
	csrr	a5,vlenb
	addiw	a2,a2,-1
	slli	a6,a6,32
	add	a6,a6,a4
	slli	a5,a5,2
	slli	a4,a2,32
	vsetvli	t1,zero,e64,m4,ta,ma
	srli	a3,a4,29
	neg	t4,a5
	addi	a7,a1,37
	mv	a4,a0
	vmv.v.x	v12,a6
	vsetvli	t3,zero,e16,m8,ta,ma
	vid.v	v16
	vand.vi	v16,v16,-8
.L4:
	minu	a6,a3,a5
	vsetvli	zero,a6,e8,m4,ta,ma
	vle8.v	v8,0(a7)
	vsetvli	t3,zero,e8,m4,ta,ma
	mv	t1,a3
	vrgatherei16.vv	v4,v8,v16
	vsetvli	zero,a6,e8,m4,ta,ma
	vadd.vv	v4,v4,v12
	vse8.v	v4,0(a4)
	add	a7,a7,a5
	add	a4,a4,a5
	add	a3,a3,t4
	bgtu	t1,a5,.L4
.L3:
	slliw	a2,a2,3
	add	a1,a1,a2
	lui	a5,%hi(.LC0)
	lbu	a4,37(a1)
	add	a0,a0,a2
	addi	a5,a5,%lo(.LC0)
	vsetivli	zero,8,e8,mf2,ta,ma
	vmv.v.x	v1,a4
	vle8.v	v2,0(a5)
	vadd.vv	v1,v1,v2
	vse8.v	v1,0(a0)
.L7:
	ret

Tested on both RV32 and RV64 with no regressions.  OK for trunk?

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): VLA preempt VLS on unknown NITERS loop.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/partial/slp-1.c: Remove xfail.
	* gcc.target/riscv/rvv/autovec/partial/slp-16.c: Ditto.
	* gcc.target/riscv/rvv/autovec/partial/slp-3.c: Ditto.
	* gcc.target/riscv/rvv/autovec/partial/slp-5.c: Ditto.
---
 gcc/config/riscv/riscv-vector-costs.cc                   | 9 +++++++++
 .../gcc.target/riscv/rvv/autovec/partial/slp-1.c         | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-16.c        | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-3.c         | 2 +-
 .../gcc.target/riscv/rvv/autovec/partial/slp-5.c         | 2 +-
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index e53f4a186f3..58ec0b9b503 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1042,6 +1042,15 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
 	    }
 	}
     }
+  /* If NITERS is unknown, we should not use VLS modes to vectorize
+     the loop since we don't support partial vectors for VLS modes,
+     that is, we will have full vectors (VLSmodes) on loop body
+     and partial vectors (VLAmodes) on loop epilogue which is very
+     inefficient.  Instead, we should apply partial vectors (VLAmodes)
+     on loop body without an epilogue on unknown NITERS loop.  */
+  else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
+	   && m_cost_type == VLS_VECTOR_COST)
+    return false;
 
   return vector_costs::better_main_loop_than_p (other);
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
index 948b20b68d3..0a1d1f72e6b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
@@ -21,6 +21,6 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvand} { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
index 7b23cafab3f..05220c32c5d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
@@ -21,6 +21,6 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
 /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param riscv-autovec-lmul=m1"} } } } */
 /* { dg-final { scan-assembler-not {\tvmul} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
index 3622c59c439..5e64231b37d 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
@@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
index 5c0a6775474..c78b3709078 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
@@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
 
 /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are chosen
    instead of SLP when riscv-autovec-lmul=m1.  */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } } } */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { any-opts "--param riscv-autovec-lmul=m1" } } } } */