From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Co-authored-by: kito-cheng <kito.cheng@sifive.com>
Co-authored-by: kito-cheng <kito.cheng@gmail.com>
This patch fixes the issue of the incorrect register order of RVV.
The new register order comes from Kito's original RVV GCC implementation.
Consider this case:
void f (void *base,void *base2,void *out,size_t vl, int n)
{
vuint64m8_t bindex = __riscv_vle64_v_u64m8 (base + 100, vl);
for (int i = 0; i < n; i++){
vbool8_t m = __riscv_vlm_v_b8 (base + i, vl);
vuint64m8_t v = __riscv_vluxei64_v_u64m8_m(m,base,bindex,vl);
vuint64m8_t v2 = __riscv_vle64_v_u64m8_tu (v, base2 + i, vl);
vint8m1_t v3 = __riscv_vluxei64_v_i8m1_m(m,base,v,vl);
vint8m1_t v4 = __riscv_vluxei64_v_i8m1_m(m,base,v2,vl);
__riscv_vse8_v_i8m1 (out + 100*i,v3,vl);
__riscv_vse8_v_i8m1 (out + 222*i,v4,vl);
}
}
Before this patch:
f:
csrr t0,vlenb
slli t1,t0,3
sub sp,sp,t1
addi a5,a0,100
vsetvli zero,a3,e64,m8,ta,ma
vle64.v v24,0(a5)
vs8r.v v24,0(sp)
ble a4,zero,.L1
mv a6,a0
add a4,a4,a0
mv a5,a2
.L3:
vsetvli zero,zero,e64,m8,ta,ma
vl8re64.v v24,0(sp)
vlm.v v0,0(a6)
vluxei64.v v24,(a0),v24,v0.t
addi a6,a6,1
vsetvli zero,zero,e8,m1,tu,ma
vmv8r.v v16,v24
vluxei64.v v8,(a0),v24,v0.t
vle64.v v16,0(a1)
vluxei64.v v24,(a0),v16,v0.t
vse8.v v8,0(a2)
vse8.v v24,0(a5)
addi a1,a1,1
addi a2,a2,100
addi a5,a5,222
bne a4,a6,.L3
.L1:
csrr t0,vlenb
slli t1,t0,3
add sp,sp,t1
jr ra
After this patch:
f:
addi a5,a0,100
vsetvli zero,a3,e64,m8,ta,ma
vle64.v v24,0(a5)
ble a4,zero,.L1
mv a6,a0
add a4,a4,a0
mv a5,a2
.L3:
vsetvli zero,zero,e64,m8,ta,ma
vlm.v v0,0(a6)
addi a6,a6,1
vluxei64.v v8,(a0),v24,v0.t
vsetvli zero,zero,e8,m1,tu,ma
vmv8r.v v16,v8
vluxei64.v v2,(a0),v8,v0.t
vle64.v v16,0(a1)
vluxei64.v v1,(a0),v16,v0.t
vse8.v v2,0(a2)
vse8.v v1,0(a5)
addi a1,a1,1
addi a2,a2,100
addi a5,a5,222
bne a4,a6,.L3
.L1:
ret
The redundant register spilling is eliminated.
However, there is one more issue that needs to be addressed: the redundant
move instruction "vmv8r.v". That is another story, and it will be fixed by a
separate patch (fine-tuning the RVV machine description RA constraints).
gcc/ChangeLog:
* config/riscv/riscv.h (enum reg_class): Fix RVV register order.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/spill-4.c: Adapt testcase.
* gcc.target/riscv/rvv/base/spill-6.c: Adapt testcase.
* gcc.target/riscv/rvv/base/reg_order-1.c: New test.
Signed-off-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Co-authored-by: kito-cheng <kito.cheng@sifive.com>
Co-authored-by: kito-cheng <kito.cheng@gmail.com>
---
gcc/config/riscv/riscv.h | 13 ++++----
.../gcc.target/riscv/rvv/base/reg_order-1.c | 20 ++++++++++++
.../gcc.target/riscv/rvv/base/spill-4.c | 32 +++++++++----------
.../gcc.target/riscv/rvv/base/spill-6.c | 16 +++++-----
4 files changed, 50 insertions(+), 31 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/reg_order-1.c
@@ -553,13 +553,12 @@ enum reg_class
60, 61, 62, 63, \
/* Call-saved FPRs. */ \
40, 41, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, \
- /* V24 ~ V31. */ \
- 120, 121, 122, 123, 124, 125, 126, 127, \
- /* V8 ~ V23. */ \
- 104, 105, 106, 107, 108, 109, 110, 111, \
- 112, 113, 114, 115, 116, 117, 118, 119, \
- /* V0 ~ V7. */ \
- 96, 97, 98, 99, 100, 101, 102, 103, \
+ /* v1 ~ v31 vector registers. */ \
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, \
+ 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, \
+ 124, 125, 126, 127, \
+ /* The vector mask register. */ \
+ 96, \
/* None of the remaining classes have defined call-saved \
registers. */ \
64, 65, 66, 67 \
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void f (void *base,void *base2,void *out,size_t vl, int n)
+{
+ vuint64m8_t bindex = __riscv_vle64_v_u64m8 (base + 100, vl);
+ for (int i = 0; i < n; i++){
+ vbool8_t m = __riscv_vlm_v_b8 (base + i, vl);
+ vuint64m8_t v = __riscv_vluxei64_v_u64m8_m(m,base,bindex,vl);
+ vuint64m8_t v2 = __riscv_vle64_v_u64m8_tu (v, base2 + i, vl);
+ vint8m1_t v3 = __riscv_vluxei64_v_i8m1_m(m,base,v,vl);
+ vint8m1_t v4 = __riscv_vluxei64_v_i8m1_m(m,base,v2,vl);
+ __riscv_vse8_v_i8m1 (out + 100*i,v3,vl);
+ __riscv_vse8_v_i8m1 (out + 222*i,v4,vl);
+ }
+}
+
+/* { dg-final { scan-assembler-not {csrr} } } */
@@ -10,7 +10,7 @@
** csrr\tt0,vlenb
** sub\tsp,sp,t0
** ...
-** vs1r.v\tv24,0\(sp\)
+** vs1r.v\tv[0-9]+,0\(sp\)
** ...
** vl1re64.v\tv2,0\(sp\)
** vs1r.v\tv2,0\(a1\)
@@ -34,7 +34,7 @@ spill_4 (int64_t *in, int64_t *out)
** slli\tt1,t0,1
** sub\tsp,sp,t1
** ...
-** vs2r.v\tv24,0\(sp\)
+** vs2r.v\tv[0-9]+,0\(sp\)
** ...
** vl2re64.v\tv4,0\(sp\)
** vs2r.v\tv4,0\(a1\)
@@ -58,10 +58,10 @@ spill_5 (int64_t *in, int64_t *out)
** slli\tt1,t0,2
** sub\tsp,sp,t1
** ...
-** vs4r.v\tv24,0\(sp\)
+** vs4r.v\tv[0-9]+,0\(sp\)
** ...
-** vl4re64.v\tv8,0\(sp\)
-** vs4r.v\tv8,0\(a1\)
+** vl4re64.v\tv[0-9]+,0\(sp\)
+** vs4r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/
@@ -82,10 +82,10 @@ spill_6 (int64_t *in, int64_t *out)
** slli\tt1,t0,3
** sub\tsp,sp,t1
** ...
-** vs8r.v\tv24,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(sp\)
** ...
-** vl8re64.v\tv16,0\(sp\)
-** vs8r.v\tv16,0\(a1\)
+** vl8re64.v\tv[0-9]+,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/
@@ -105,7 +105,7 @@ spill_7 (int64_t *in, int64_t *out)
** csrr\tt0,vlenb
** sub\tsp,sp,t0
** ...
-** vs1r.v\tv24,0\(sp\)
+** vs1r.v\tv[0-9]+,0\(sp\)
** ...
** vl1re64.v\tv2,0\(sp\)
** vs1r.v\tv2,0\(a1\)
@@ -129,7 +129,7 @@ spill_11 (uint64_t *in, uint64_t *out)
** slli\tt1,t0,1
** sub\tsp,sp,t1
** ...
-** vs2r.v\tv24,0\(sp\)
+** vs2r.v\tv[0-9]+,0\(sp\)
** ...
** vl2re64.v\tv4,0\(sp\)
** vs2r.v\tv4,0\(a1\)
@@ -153,10 +153,10 @@ spill_12 (uint64_t *in, uint64_t *out)
** slli\tt1,t0,2
** sub\tsp,sp,t1
** ...
-** vs4r.v\tv24,0\(sp\)
+** vs4r.v\tv[0-9]+,0\(sp\)
** ...
-** vl4re64.v\tv8,0\(sp\)
-** vs4r.v\tv8,0\(a1\)
+** vl4re64.v\tv[0-9]+,0\(sp\)
+** vs4r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/
@@ -177,10 +177,10 @@ spill_13 (uint64_t *in, uint64_t *out)
** slli\tt1,t0,3
** sub\tsp,sp,t1
** ...
-** vs8r.v\tv24,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(sp\)
** ...
-** vl8re64.v\tv16,0\(sp\)
-** vs8r.v\tv16,0\(a1\)
+** vl8re64.v\tv[0-9]+,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/
@@ -10,10 +10,10 @@
** csrr\tt0,vlenb
** sub\tsp,sp,t0
** ...
-** vs1r.v\tv24,0\(sp\)
+** vs1r.v\tv[0-9]+,0\(sp\)
** ...
-** vl1re64.v\tv2,0\(sp\)
-** vs1r.v\tv2,0\(a1\)
+** vl1re64.v\tv[0-9]+,0\(sp\)
+** vs1r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/
@@ -34,7 +34,7 @@ spill_4 (double *in, double *out)
** slli\tt1,t0,1
** sub\tsp,sp,t1
** ...
-** vs2r.v\tv24,0\(sp\)
+** vs2r.v\tv[0-9]+,0\(sp\)
** ...
** vl2re64.v\tv4,0\(sp\)
** vs2r.v\tv4,0\(a1\)
@@ -58,7 +58,7 @@ spill_5 (double *in, double *out)
** slli\tt1,t0,2
** sub\tsp,sp,t1
** ...
-** vs4r.v\tv24,0\(sp\)
+** vs4r.v\tv[0-9]+,0\(sp\)
** ...
** vl4re64.v\tv8,0\(sp\)
** vs4r.v\tv8,0\(a1\)
@@ -82,10 +82,10 @@ spill_6 (double *in, double *out)
** slli\tt1,t0,3
** sub\tsp,sp,t1
** ...
-** vs8r.v\tv24,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(sp\)
** ...
-** vl8re64.v\tv16,0\(sp\)
-** vs8r.v\tv16,0\(a1\)
+** vl8re64.v\tv[0-9]+,0\(sp\)
+** vs8r.v\tv[0-9]+,0\(a1\)
** ...
** jr\tra
*/