RISC-V: Support highpart register overlap for vwcvt

Message ID 20231129083410.2419576-1-juzhe.zhong@rivai.ai
State Committed
Commit bdad036da32f72b84a96070518e7d75c21706dc2
Headers
Series RISC-V: Support highpart register overlap for vwcvt |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
rivoscibot/toolchain-ci-rivos-lint warning Lint failed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gc-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-non-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv32gc_zba_zbb_zbc_zbs-ilp32d-non-multilib success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 warning Patch is already merged
rivoscibot/toolchain-ci-rivos-test success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm warning Patch is already merged

Commit Message

juzhe.zhong@rivai.ai Nov. 29, 2023, 8:34 a.m. UTC
  Since Richard supports register filters recently, we are able to support highpart register
overlap for widening RVV instructions.

This patch support it for vwcvt intrinsics.

I leverage real application user codes for vwcvt:
https://github.com/riscv/riscv-v-spec/issues/929
https://godbolt.org/z/xoeGnzd8q

This is the real application codes that using LMUL = 8 with unrolling to gain optimal
performance for specific libraury.

You can see in the codegen, GCC has optimal codegen for such since we supported register
lowpart overlap for narrowing instructions (dest EEW < source EEW).

Now, we start to support highpart register overlap from this patch for widening instructions (dest EEW > source EEW).

Leverage this intrinsic codes above but for vwcvt:

https://godbolt.org/z/1TMPE5Wfr

size_t
foo (char const *buf, size_t len)
{
  size_t sum = 0;
  size_t vl = __riscv_vsetvlmax_e8m8 ();
  size_t step = vl * 4;
  const char *it = buf, *end = buf + len;
  for (; it + step <= end;)
    {
      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;

      asm volatile("nop" ::: "memory");
      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);

      asm volatile("nop" ::: "memory");
      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);

      sum += sumation (sum0, sum1, sum2, sum3);
    }
  return sum;
}

Before this patch:

...
csrr    t0,vlenb
...
        vwcvt.x.x.v     v16,v8
        vwcvt.x.x.v     v8,v28
        vs8r.v  v16,0(sp)               ---> spill
        vwcvt.x.x.v     v16,v24
        vwcvt.x.x.v     v24,v4
        nop
        vsetvli zero,zero,e16,m8,ta,ma
        vmv.x.s a2,v16
        vl8re16.v       v16,0(sp)      --->  reload
...
csrr    t0,vlenb
...

You can see heavy spill && reload inside the loop body.

After this patch:

...
	vwcvt.x.x.v	v8,v12
	vwcvt.x.x.v	v16,v20
	vwcvt.x.x.v	v24,v28
	vwcvt.x.x.v	v0,v4
...

Optimal codegen after this patch.

Tested on zvl128b no regression.

I am gonna to test zve64d/zvl256b/zvl512b/zvl1024b.

Ok for trunk if no regression on the testing above ?

Co-authored-by: kito-cheng <kito.cheng@sifive.com>
Co-authored-by: kito-cheng <kito.cheng@gmail.com>

	PR target/112431

gcc/ChangeLog:

	* config/riscv/constraints.md (TARGET_VECTOR ? V_REGS : NO_REGS): New register filters.
	* config/riscv/riscv.md (no,W21,W42,W84,W41,W81,W82): Ditto.
	(no,yes): Ditto.
	* config/riscv/vector.md: Support highpart register overlap for vwcvt.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/pr112431-1.c: New test.
	* gcc.target/riscv/rvv/base/pr112431-2.c: New test.
	* gcc.target/riscv/rvv/base/pr112431-3.c: New test.

---
 gcc/config/riscv/constraints.md               |  23 ++++
 gcc/config/riscv/riscv.md                     |  24 ++++
 gcc/config/riscv/vector.md                    |  19 ++--
 .../gcc.target/riscv/rvv/base/pr112431-1.c    | 104 ++++++++++++++++++
 .../gcc.target/riscv/rvv/base/pr112431-2.c    |  68 ++++++++++++
 .../gcc.target/riscv/rvv/base/pr112431-3.c    |  51 +++++++++
 6 files changed, 280 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
  

Comments

juzhe.zhong@rivai.ai Nov. 29, 2023, 8:35 a.m. UTC | #1
Sorry for sending it twice.

Add

Co-authored-by: kito-cheng <kito.cheng@sifive.com>
Co-authored-by: kito-cheng <kito.cheng@gmail.com>in changelog.
No other difference.


juzhe.zhong@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-11-29 16:34
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support highpart register overlap for vwcvt
Since Richard supports register filters recently, we are able to support highpart register
overlap for widening RVV instructions.
 
This patch support it for vwcvt intrinsics.
 
I leverage real application user codes for vwcvt:
https://github.com/riscv/riscv-v-spec/issues/929
https://godbolt.org/z/xoeGnzd8q
 
This is the real application codes that using LMUL = 8 with unrolling to gain optimal
performance for specific libraury.
 
You can see in the codegen, GCC has optimal codegen for such since we supported register
lowpart overlap for narrowing instructions (dest EEW < source EEW).
 
Now, we start to support highpart register overlap from this patch for widening instructions (dest EEW > source EEW).
 
Leverage this intrinsic codes above but for vwcvt:
 
https://godbolt.org/z/1TMPE5Wfr
 
size_t
foo (char const *buf, size_t len)
{
  size_t sum = 0;
  size_t vl = __riscv_vsetvlmax_e8m8 ();
  size_t step = vl * 4;
  const char *it = buf, *end = buf + len;
  for (; it + step <= end;)
    {
      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
 
      asm volatile("nop" ::: "memory");
      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
 
      asm volatile("nop" ::: "memory");
      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
 
      sum += sumation (sum0, sum1, sum2, sum3);
    }
  return sum;
}
 
Before this patch:
 
...
csrr    t0,vlenb
...
        vwcvt.x.x.v     v16,v8
        vwcvt.x.x.v     v8,v28
        vs8r.v  v16,0(sp)               ---> spill
        vwcvt.x.x.v     v16,v24
        vwcvt.x.x.v     v24,v4
        nop
        vsetvli zero,zero,e16,m8,ta,ma
        vmv.x.s a2,v16
        vl8re16.v       v16,0(sp)      --->  reload
...
csrr    t0,vlenb
...
 
You can see heavy spill && reload inside the loop body.
 
After this patch:
 
...
vwcvt.x.x.v v8,v12
vwcvt.x.x.v v16,v20
vwcvt.x.x.v v24,v28
vwcvt.x.x.v v0,v4
...
 
Optimal codegen after this patch.
 
Tested on zvl128b no regression.
 
I am gonna to test zve64d/zvl256b/zvl512b/zvl1024b.
 
Ok for trunk if no regression on the testing above ?
 
Co-authored-by: kito-cheng <kito.cheng@sifive.com>
Co-authored-by: kito-cheng <kito.cheng@gmail.com>
 
PR target/112431
 
gcc/ChangeLog:
 
* config/riscv/constraints.md (TARGET_VECTOR ? V_REGS : NO_REGS): New register filters.
* config/riscv/riscv.md (no,W21,W42,W84,W41,W81,W82): Ditto.
(no,yes): Ditto.
* config/riscv/vector.md: Support highpart register overlap for vwcvt.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/pr112431-1.c: New test.
* gcc.target/riscv/rvv/base/pr112431-2.c: New test.
* gcc.target/riscv/rvv/base/pr112431-3.c: New test.
 
---
gcc/config/riscv/constraints.md               |  23 ++++
gcc/config/riscv/riscv.md                     |  24 ++++
gcc/config/riscv/vector.md                    |  19 ++--
.../gcc.target/riscv/rvv/base/pr112431-1.c    | 104 ++++++++++++++++++
.../gcc.target/riscv/rvv/base/pr112431-2.c    |  68 ++++++++++++
.../gcc.target/riscv/rvv/base/pr112431-3.c    |  51 +++++++++
6 files changed, 280 insertions(+), 9 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
 
diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 68be4515c04..19bb36616bf 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -169,6 +169,29 @@
(define_register_constraint "vm" "TARGET_VECTOR ? VM_REGS : NO_REGS"
   "A vector mask register (if available).")
+;; These following constraints are used by RVV instructions with dest EEW > src EEW.
+;; RISC-V 'V' Spec 5.2. Vector Operands:
+;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
+;; and the overlap is in the highest-numbered part of the destination register group.
+;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
+(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 2 == 1." "regno % 2 == 1")
+
+(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 2." "regno % 4 == 2")
+
+(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 4." "regno % 8 == 4")
+
+(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 1." "regno % 4 == 1")
+
+(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 1." "regno % 8 == 1")
+
+(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 2." "regno % 8 == 2")
+
;; This constraint is used to match instruction "csrr %0, vlenb" which is generated in "mov<mode>".
;; VLENB is a run-time constant which represent the vector register length in bytes.
;; BYTES_PER_RISCV_VECTOR represent runtime invariant of vector register length in bytes.
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 935eeb7fd8e..6bf2dfdf9b4 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -501,6 +501,27 @@
   ]
   (const_string "no")))
+(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
+  (const_string "no"))
+
+(define_attr "vconstraint_enabled" "no,yes"
+  (cond [(eq_attr "vconstraint" "no")
+         (const_string "yes")
+
+         (and (eq_attr "vconstraint" "W21")
+       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
+ (const_string "no")
+
+         (and (eq_attr "vconstraint" "W42,W41")
+       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
+ (const_string "no")
+
+         (and (eq_attr "vconstraint" "W84,W81,W82")
+       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
+ (const_string "no")
+        ]
+       (const_string "yes")))
+
;; Attribute to control enable or disable instructions.
(define_attr "enabled" "no,yes"
   (cond [
@@ -509,6 +530,9 @@
     (eq_attr "fp_vector_disabled" "yes")
     (const_string "no")
+
+    (eq_attr "vconstraint_enabled" "no")
+    (const_string "no")
   ]
   (const_string "yes")))
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index ba9c9e5a9b6..bace900fee5 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3898,22 +3898,22 @@
;; vwcvt<u>.x.x.v
(define_insn "@pred_<optab><mode>"
-  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
(if_then_else:VWEXTI
  (unspec:<VM>
-     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1")
-      (match_operand 4 "vector_length_operand"              "   rK,   rK")
-      (match_operand 5 "const_int_operand"                  "    i,    i")
-      (match_operand 6 "const_int_operand"                  "    i,    i")
-      (match_operand 7 "const_int_operand"                  "    i,    i")
+     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
+      (match_operand 4 "vector_length_operand"              "   rK,   rK,   rK,   rK,   rK,   rK,   rK,   rK")
+      (match_operand 5 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+      (match_operand 6 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+      (match_operand 7 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
     (reg:SI VL_REGNUM)
     (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
  (plus:VWEXTI
    (any_extend:VWEXTI
-       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,   vr"))
+       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "  W21,  W21,  W42,  W42,  W84,  W84,   vr,   vr"))
    (vec_duplicate:VWEXTI
      (reg:<VEL> X0_REGNUM)))
-   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
+   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0,   vu,    0,   vu,    0,   vu,    0")))]
   "TARGET_VECTOR"
   "vwcvt<u>.x.x.v\t%0,%3%p1"
   [(set_attr "type" "viwalu")
@@ -3921,7 +3921,8 @@
    (set_attr "vl_op_idx" "4")
    (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[5])"))
    (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
-   (set (attr "avl_type_idx") (const_int 7))])
+   (set (attr "avl_type_idx") (const_int 7))
+   (set_attr "vconstraint" "W21,W21,W42,W42,W84,W84,no,no")])
;; -------------------------------------------------------------------------------
;; ---- Predicated integer Narrowing operations
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
new file mode 100644
index 00000000000..6b9a7c448f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
@@ -0,0 +1,104 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+   size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
+   size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
+   size_t sum15)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
+ + sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v8 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v9 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v10 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v11 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v12 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v13 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v14 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v15 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      
+      asm volatile("nop" ::: "memory");
+      vint16m2_t vw0 = __riscv_vwcvt_x_x_v_i16m2 (v0, vl);
+      vint16m2_t vw1 = __riscv_vwcvt_x_x_v_i16m2 (v1, vl);
+      vint16m2_t vw2 = __riscv_vwcvt_x_x_v_i16m2 (v2, vl);
+      vint16m2_t vw3 = __riscv_vwcvt_x_x_v_i16m2 (v3, vl);
+      vint16m2_t vw4 = __riscv_vwcvt_x_x_v_i16m2 (v4, vl);
+      vint16m2_t vw5 = __riscv_vwcvt_x_x_v_i16m2 (v5, vl);
+      vint16m2_t vw6 = __riscv_vwcvt_x_x_v_i16m2 (v6, vl);
+      vint16m2_t vw7 = __riscv_vwcvt_x_x_v_i16m2 (v7, vl);
+      vint16m2_t vw8 = __riscv_vwcvt_x_x_v_i16m2 (v8, vl);
+      vint16m2_t vw9 = __riscv_vwcvt_x_x_v_i16m2 (v9, vl);
+      vint16m2_t vw10 = __riscv_vwcvt_x_x_v_i16m2 (v10, vl);
+      vint16m2_t vw11 = __riscv_vwcvt_x_x_v_i16m2 (v11, vl);
+      vint16m2_t vw12 = __riscv_vwcvt_x_x_v_i16m2 (v12, vl);
+      vint16m2_t vw13 = __riscv_vwcvt_x_x_v_i16m2 (v13, vl);
+      vint16m2_t vw14 = __riscv_vwcvt_x_x_v_i16m2 (v14, vl);
+      vint16m2_t vw15 = __riscv_vwcvt_x_x_v_i16m2 (v15, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m2_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m2_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m2_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m2_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m2_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m2_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m2_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m2_i16 (vw7);
+      size_t sum8 = __riscv_vmv_x_s_i16m2_i16 (vw8);
+      size_t sum9 = __riscv_vmv_x_s_i16m2_i16 (vw9);
+      size_t sum10 = __riscv_vmv_x_s_i16m2_i16 (vw10);
+      size_t sum11 = __riscv_vmv_x_s_i16m2_i16 (vw11);
+      size_t sum12 = __riscv_vmv_x_s_i16m2_i16 (vw12);
+      size_t sum13 = __riscv_vmv_x_s_i16m2_i16 (vw13);
+      size_t sum14 = __riscv_vmv_x_s_i16m2_i16 (vw14);
+      size_t sum15 = __riscv_vmv_x_s_i16m2_i16 (vw15);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8,
+        sum9, sum10, sum11, sum12, sum13, sum14, sum15);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
new file mode 100644
index 00000000000..da92d59406f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+   size_t sum5, size_t sum6, size_t sum7)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v4 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v5 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v6 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v7 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m4_t vw0 = __riscv_vwcvt_x_x_v_i16m4 (v0, vl);
+      vint16m4_t vw1 = __riscv_vwcvt_x_x_v_i16m4 (v1, vl);
+      vint16m4_t vw2 = __riscv_vwcvt_x_x_v_i16m4 (v2, vl);
+      vint16m4_t vw3 = __riscv_vwcvt_x_x_v_i16m4 (v3, vl);
+      vint16m4_t vw4 = __riscv_vwcvt_x_x_v_i16m4 (v4, vl);
+      vint16m4_t vw5 = __riscv_vwcvt_x_x_v_i16m4 (v5, vl);
+      vint16m4_t vw6 = __riscv_vwcvt_x_x_v_i16m4 (v6, vl);
+      vint16m4_t vw7 = __riscv_vwcvt_x_x_v_i16m4 (v7, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m4_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m4_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m4_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m4_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m4_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m4_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m4_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m4_i16 (vw7);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
new file mode 100644
index 00000000000..46f93a9049b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
+{
+  return sum0 + sum1 + sum2 + sum3;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
+      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
+      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
+      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
+
+      sum += sumation (sum0, sum1, sum2, sum3);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
-- 
2.36.3
  
Kito Cheng Nov. 29, 2023, 9:06 a.m. UTC | #2
OK, thanks for moving that forward!

but just one nit: don't include godbolt.org url as possible since it's
not permanently valid.

On Wed, Nov 29, 2023 at 4:36 PM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> Sorry for sending it twice.
>
> Add
>
> Co-authored-by: kito-cheng <kito.cheng@sifive.com>
> Co-authored-by: kito-cheng <kito.cheng@gmail.com>
>
> in changelog.
>
>
> No other difference.
>
> ________________________________
> juzhe.zhong@rivai.ai
>
>
> From: Juzhe-Zhong
> Date: 2023-11-29 16:34
> To: gcc-patches
> CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
> Subject: [PATCH] RISC-V: Support highpart register overlap for vwcvt
> Since Richard supports register filters recently, we are able to support highpart register
> overlap for widening RVV instructions.
>
> This patch support it for vwcvt intrinsics.
>
> I leverage real application user codes for vwcvt:
> https://github.com/riscv/riscv-v-spec/issues/929
> https://godbolt.org/z/xoeGnzd8q
>
> This is the real application codes that using LMUL = 8 with unrolling to gain optimal
> performance for specific libraury.
>
> You can see in the codegen, GCC has optimal codegen for such since we supported register
> lowpart overlap for narrowing instructions (dest EEW < source EEW).
>
> Now, we start to support highpart register overlap from this patch for widening instructions (dest EEW > source EEW).
>
> Leverage this intrinsic codes above but for vwcvt:
>
> https://godbolt.org/z/1TMPE5Wfr
>
> size_t
> foo (char const *buf, size_t len)
> {
>   size_t sum = 0;
>   size_t vl = __riscv_vsetvlmax_e8m8 ();
>   size_t step = vl * 4;
>   const char *it = buf, *end = buf + len;
>   for (; it + step <= end;)
>     {
>       vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>
>       asm volatile("nop" ::: "memory");
>       vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
>       vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
>       vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
>       vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
>
>       asm volatile("nop" ::: "memory");
>       size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
>       size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
>       size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
>       size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
>
>       sum += sumation (sum0, sum1, sum2, sum3);
>     }
>   return sum;
> }
>
> Before this patch:
>
> ...
> csrr    t0,vlenb
> ...
>         vwcvt.x.x.v     v16,v8
>         vwcvt.x.x.v     v8,v28
>         vs8r.v  v16,0(sp)               ---> spill
>         vwcvt.x.x.v     v16,v24
>         vwcvt.x.x.v     v24,v4
>         nop
>         vsetvli zero,zero,e16,m8,ta,ma
>         vmv.x.s a2,v16
>         vl8re16.v       v16,0(sp)      --->  reload
> ...
> csrr    t0,vlenb
> ...
>
> You can see heavy spill && reload inside the loop body.
>
> After this patch:
>
> ...
> vwcvt.x.x.v v8,v12
> vwcvt.x.x.v v16,v20
> vwcvt.x.x.v v24,v28
> vwcvt.x.x.v v0,v4
> ...
>
> Optimal codegen after this patch.
>
> Tested on zvl128b no regression.
>
> I am gonna to test zve64d/zvl256b/zvl512b/zvl1024b.
>
> Ok for trunk if no regression on the testing above ?
>
> Co-authored-by: kito-cheng <kito.cheng@sifive.com>
> Co-authored-by: kito-cheng <kito.cheng@gmail.com>
>
> PR target/112431
>
> gcc/ChangeLog:
>
> * config/riscv/constraints.md (TARGET_VECTOR ? V_REGS : NO_REGS): New register filters.
> * config/riscv/riscv.md (no,W21,W42,W84,W41,W81,W82): Ditto.
> (no,yes): Ditto.
> * config/riscv/vector.md: Support highpart register overlap for vwcvt.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/pr112431-1.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-2.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-3.c: New test.
>
> ---
> gcc/config/riscv/constraints.md               |  23 ++++
> gcc/config/riscv/riscv.md                     |  24 ++++
> gcc/config/riscv/vector.md                    |  19 ++--
> .../gcc.target/riscv/rvv/base/pr112431-1.c    | 104 ++++++++++++++++++
> .../gcc.target/riscv/rvv/base/pr112431-2.c    |  68 ++++++++++++
> .../gcc.target/riscv/rvv/base/pr112431-3.c    |  51 +++++++++
> 6 files changed, 280 insertions(+), 9 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
>
> diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
> index 68be4515c04..19bb36616bf 100644
> --- a/gcc/config/riscv/constraints.md
> +++ b/gcc/config/riscv/constraints.md
> @@ -169,6 +169,29 @@
> (define_register_constraint "vm" "TARGET_VECTOR ? VM_REGS : NO_REGS"
>    "A vector mask register (if available).")
> +;; These following constraints are used by RVV instructions with dest EEW > src EEW.
> +;; RISC-V 'V' Spec 5.2. Vector Operands:
> +;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
> +;; and the overlap is in the highest-numbered part of the destination register group.
> +;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
> +(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 2 == 1." "regno % 2 == 1")
> +
> +(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 2." "regno % 4 == 2")
> +
> +(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 4." "regno % 8 == 4")
> +
> +(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 1." "regno % 4 == 1")
> +
> +(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 1." "regno % 8 == 1")
> +
> +(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 2." "regno % 8 == 2")
> +
> ;; This constraint is used to match instruction "csrr %0, vlenb" which is generated in "mov<mode>".
> ;; VLENB is a run-time constant which represent the vector register length in bytes.
> ;; BYTES_PER_RISCV_VECTOR represent runtime invariant of vector register length in bytes.
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 935eeb7fd8e..6bf2dfdf9b4 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -501,6 +501,27 @@
>    ]
>    (const_string "no")))
> +(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
> +  (const_string "no"))
> +
> +(define_attr "vconstraint_enabled" "no,yes"
> +  (cond [(eq_attr "vconstraint" "no")
> +         (const_string "yes")
> +
> +         (and (eq_attr "vconstraint" "W21")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W42,W41")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W84,W81,W82")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
> + (const_string "no")
> +        ]
> +       (const_string "yes")))
> +
> ;; Attribute to control enable or disable instructions.
> (define_attr "enabled" "no,yes"
>    (cond [
> @@ -509,6 +530,9 @@
>      (eq_attr "fp_vector_disabled" "yes")
>      (const_string "no")
> +
> +    (eq_attr "vconstraint_enabled" "no")
> +    (const_string "no")
>    ]
>    (const_string "yes")))
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index ba9c9e5a9b6..bace900fee5 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -3898,22 +3898,22 @@
> ;; vwcvt<u>.x.x.v
> (define_insn "@pred_<optab><mode>"
> -  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
> +  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
> (if_then_else:VWEXTI
>   (unspec:<VM>
> -     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1")
> -      (match_operand 4 "vector_length_operand"              "   rK,   rK")
> -      (match_operand 5 "const_int_operand"                  "    i,    i")
> -      (match_operand 6 "const_int_operand"                  "    i,    i")
> -      (match_operand 7 "const_int_operand"                  "    i,    i")
> +     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
> +      (match_operand 4 "vector_length_operand"              "   rK,   rK,   rK,   rK,   rK,   rK,   rK,   rK")
> +      (match_operand 5 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
> +      (match_operand 6 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
> +      (match_operand 7 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
>      (reg:SI VL_REGNUM)
>      (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>   (plus:VWEXTI
>     (any_extend:VWEXTI
> -       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,   vr"))
> +       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "  W21,  W21,  W42,  W42,  W84,  W84,   vr,   vr"))
>     (vec_duplicate:VWEXTI
>       (reg:<VEL> X0_REGNUM)))
> -   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
> +   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0,   vu,    0,   vu,    0,   vu,    0")))]
>    "TARGET_VECTOR"
>    "vwcvt<u>.x.x.v\t%0,%3%p1"
>    [(set_attr "type" "viwalu")
> @@ -3921,7 +3921,8 @@
>     (set_attr "vl_op_idx" "4")
>     (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[5])"))
>     (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
> -   (set (attr "avl_type_idx") (const_int 7))])
> +   (set (attr "avl_type_idx") (const_int 7))
> +   (set_attr "vconstraint" "W21,W21,W42,W42,W84,W84,no,no")])
> ;; -------------------------------------------------------------------------------
> ;; ---- Predicated integer Narrowing operations
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> new file mode 100644
> index 00000000000..6b9a7c448f0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> @@ -0,0 +1,104 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
> +   size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
> +   size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
> +   size_t sum15)
> +{
> +  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
> + + sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v8 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v9 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v10 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v11 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v12 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v13 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v14 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v15 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m2_t vw0 = __riscv_vwcvt_x_x_v_i16m2 (v0, vl);
> +      vint16m2_t vw1 = __riscv_vwcvt_x_x_v_i16m2 (v1, vl);
> +      vint16m2_t vw2 = __riscv_vwcvt_x_x_v_i16m2 (v2, vl);
> +      vint16m2_t vw3 = __riscv_vwcvt_x_x_v_i16m2 (v3, vl);
> +      vint16m2_t vw4 = __riscv_vwcvt_x_x_v_i16m2 (v4, vl);
> +      vint16m2_t vw5 = __riscv_vwcvt_x_x_v_i16m2 (v5, vl);
> +      vint16m2_t vw6 = __riscv_vwcvt_x_x_v_i16m2 (v6, vl);
> +      vint16m2_t vw7 = __riscv_vwcvt_x_x_v_i16m2 (v7, vl);
> +      vint16m2_t vw8 = __riscv_vwcvt_x_x_v_i16m2 (v8, vl);
> +      vint16m2_t vw9 = __riscv_vwcvt_x_x_v_i16m2 (v9, vl);
> +      vint16m2_t vw10 = __riscv_vwcvt_x_x_v_i16m2 (v10, vl);
> +      vint16m2_t vw11 = __riscv_vwcvt_x_x_v_i16m2 (v11, vl);
> +      vint16m2_t vw12 = __riscv_vwcvt_x_x_v_i16m2 (v12, vl);
> +      vint16m2_t vw13 = __riscv_vwcvt_x_x_v_i16m2 (v13, vl);
> +      vint16m2_t vw14 = __riscv_vwcvt_x_x_v_i16m2 (v14, vl);
> +      vint16m2_t vw15 = __riscv_vwcvt_x_x_v_i16m2 (v15, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m2_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m2_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m2_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m2_i16 (vw3);
> +      size_t sum4 = __riscv_vmv_x_s_i16m2_i16 (vw4);
> +      size_t sum5 = __riscv_vmv_x_s_i16m2_i16 (vw5);
> +      size_t sum6 = __riscv_vmv_x_s_i16m2_i16 (vw6);
> +      size_t sum7 = __riscv_vmv_x_s_i16m2_i16 (vw7);
> +      size_t sum8 = __riscv_vmv_x_s_i16m2_i16 (vw8);
> +      size_t sum9 = __riscv_vmv_x_s_i16m2_i16 (vw9);
> +      size_t sum10 = __riscv_vmv_x_s_i16m2_i16 (vw10);
> +      size_t sum11 = __riscv_vmv_x_s_i16m2_i16 (vw11);
> +      size_t sum12 = __riscv_vmv_x_s_i16m2_i16 (vw12);
> +      size_t sum13 = __riscv_vmv_x_s_i16m2_i16 (vw13);
> +      size_t sum14 = __riscv_vmv_x_s_i16m2_i16 (vw14);
> +      size_t sum15 = __riscv_vmv_x_s_i16m2_i16 (vw15);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8,
> +        sum9, sum10, sum11, sum12, sum13, sum14, sum15);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> new file mode 100644
> index 00000000000..da92d59406f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
> +   size_t sum5, size_t sum6, size_t sum7)
> +{
> +  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v4 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v5 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v6 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v7 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m4_t vw0 = __riscv_vwcvt_x_x_v_i16m4 (v0, vl);
> +      vint16m4_t vw1 = __riscv_vwcvt_x_x_v_i16m4 (v1, vl);
> +      vint16m4_t vw2 = __riscv_vwcvt_x_x_v_i16m4 (v2, vl);
> +      vint16m4_t vw3 = __riscv_vwcvt_x_x_v_i16m4 (v3, vl);
> +      vint16m4_t vw4 = __riscv_vwcvt_x_x_v_i16m4 (v4, vl);
> +      vint16m4_t vw5 = __riscv_vwcvt_x_x_v_i16m4 (v5, vl);
> +      vint16m4_t vw6 = __riscv_vwcvt_x_x_v_i16m4 (v6, vl);
> +      vint16m4_t vw7 = __riscv_vwcvt_x_x_v_i16m4 (v7, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m4_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m4_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m4_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m4_i16 (vw3);
> +      size_t sum4 = __riscv_vmv_x_s_i16m4_i16 (vw4);
> +      size_t sum5 = __riscv_vmv_x_s_i16m4_i16 (vw5);
> +      size_t sum6 = __riscv_vmv_x_s_i16m4_i16 (vw6);
> +      size_t sum7 = __riscv_vmv_x_s_i16m4_i16 (vw7);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
> new file mode 100644
> index 00000000000..46f93a9049b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
> +{
> +  return sum0 + sum1 + sum2 + sum3;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
> +      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
> +      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
> +      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> --
> 2.36.3
>
  
Robin Dapp Nov. 29, 2023, 11:01 a.m. UTC | #3
Looks like this already went in while I was looking at it...

In general it looks ok to me but I would have really hoped
for some more comments.

> +;; These following constraints are used by RVV instructions with dest EEW > src EEW.
> +;; RISC-V 'V' Spec 5.2. Vector Operands:
> +;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
> +;; and the overlap is in the highest-numbered part of the destination register group.
> +;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
> +(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 2 == 1." "regno % 2 == 1")
> +
> +(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 2." "regno % 4 == 2")
> +
> +(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 4." "regno % 8 == 4")
> +
> +(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 1." "regno % 4 == 1")
> +
> +(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 1." "regno % 8 == 1")
> +
> +(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 2." "regno % 8 == 2")
> +

I can't really match spec and code.  For the lmul = 2 case sure,
but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?

> +(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
> +  (const_string "no"))
> +
> +(define_attr "vconstraint_enabled" "no,yes"
> +  (cond [(eq_attr "vconstraint" "no")
> +         (const_string "yes")
> +
> +         (and (eq_attr "vconstraint" "W21")
> +	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
> +	 (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W42,W41")
> +	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
> +	 (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W84,W81,W82")
> +	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
> +	 (const_string "no")
> +        ]
> +       (const_string "yes")))
> +

The name vconstraint doesn't say anything.

>  ;; vwcvt<u>.x.x.v
>  (define_insn "@pred_<optab><mode>"
> -  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
> +  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")

Regarding earlyclobber.  We still don't consume all inputs before
writing them (there is overlap after all).  Is that the gist of
the change?  Circumventing the earlyclobber by "strategically"
selecting registers?

Why is the disparage necessary?  What happens if it's not there?

Regards
 Robin
  
juzhe.zhong@rivai.ai Nov. 29, 2023, 11:53 a.m. UTC | #4
>> Looks like this already went in while I was looking at it...
I committed it after kito approve that.

>> I can't really match spec and code.  For the lmul = 2 case sure,
>> but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?
Yes.

For vwcvt, LMUL 4 -> LMUL 8. 
We allow overlap  vwcvt v0 (occupy v0 - v7), v4 (occupy v4 - v7)
This patch support the overlap above.
Without this patch, vwcvt is completely non-overlap in any register.

>>The name vconstraint doesn't say anything.
This is kito's code. Could you suggest another name ? I can modify it.

>>Regarding earlyclobber.  We still don't consume all inputs before
>>writing them (there is overlap after all).  Is that the gist of
>>the change?  Circumventing the earlyclobber by "strategically"
>>selecting registers?
 
>>Why is the disparage necessary?  What happens if it's not there?

I experiment with many tests, turns out adding ? generate better codegen.
You can try it (remove ?) and testing it on case (I added in this patch).



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-11-29 19:01
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support highpart register overlap for vwcvt
Looks like this already went in while I was looking at it...
 
In general it looks ok to me but I would have really hoped
for some more comments.
 
> +;; These following constraints are used by RVV instructions with dest EEW > src EEW.
> +;; RISC-V 'V' Spec 5.2. Vector Operands:
> +;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
> +;; and the overlap is in the highest-numbered part of the destination register group.
> +;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
> +(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 2 == 1." "regno % 2 == 1")
> +
> +(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 2." "regno % 4 == 2")
> +
> +(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 4." "regno % 8 == 4")
> +
> +(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 1." "regno % 4 == 1")
> +
> +(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 1." "regno % 8 == 1")
> +
> +(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 2." "regno % 8 == 2")
> +
 
I can't really match spec and code.  For the lmul = 2 case sure,
but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?
 
> +(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
> +  (const_string "no"))
> +
> +(define_attr "vconstraint_enabled" "no,yes"
> +  (cond [(eq_attr "vconstraint" "no")
> +         (const_string "yes")
> +
> +         (and (eq_attr "vconstraint" "W21")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W42,W41")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W84,W81,W82")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
> + (const_string "no")
> +        ]
> +       (const_string "yes")))
> +
 
The name vconstraint doesn't say anything.
 
>  ;; vwcvt<u>.x.x.v
>  (define_insn "@pred_<optab><mode>"
> -  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
> +  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
 
Regarding earlyclobber.  We still don't consume all inputs before
writing them (there is overlap after all).  Is that the gist of
the change?  Circumventing the earlyclobber by "strategically"
selecting registers?
 
Why is the disparage necessary?  What happens if it's not there?
 
Regards
Robin
  
Robin Dapp Nov. 29, 2023, 1:30 p.m. UTC | #5
>>> I can't really match spec and code.  For the lmul = 2 case sure,
>>> but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?
> Yes.
> 
> For vwcvt, LMUL 4 -> LMUL 8. 
> We allow overlap  vwcvt v0 (occupy v0 - v7), v4 (occupy v4 - v7)
> This patch support the overlap above.

Ok thanks, that way it makes sense.  The allowed overlap size is the
size of the source group which is determined by the "extension factor".

But don't we allow e.g. v2 and v4 with W82?  Shouldn't it be % 8 == 6
and % 8 == 7 for W82 and W81? Or for W41, % 4 == 3?  At least when looking
at the given spec example that would correspond to W82?

> This is kito's code. Could you suggest another name ? I can modify it.

overlap or group_overlap.  Then change "no" to "none" and rename
"vconstraint_enabled" to "group_overlap_valid" (or without the group).

Add a comment to group_overlap_valid:

; Widening instructions have group-overlap constraints.  Those are only
; valid for certain register-group sizes.  This attribute marks the
; alternatives not matching the required register-group size as disabled.


> I experiment with many tests, turns out adding ? generate better codegen.
> You can try it (remove ?) and testing it on case (I added in this patch).

It looks like we spill without it but I don't get why.  Well, as long
as it works, I guess we can defer that question.

Regards
 Robin
  
juzhe.zhong@rivai.ai Nov. 29, 2023, 1:37 p.m. UTC | #6
>> But don't we allow e.g. v2 and v4 with W82?  Shouldn't it be % 8 == 6
>> and % 8 == 7 for W82 and W81? Or for W41, % 4 == 3?  At least when looking
>> at the given spec example that would correspond to W82?

I think you are right.  It should be W86 for vsext.vf4 (LMUL2 -> LMUL8)
W87 for vsext.vf8 (LMUL1->LMUL8)
W43 for vsext.vf4 (LMUL1->LMUL4)

This patch is just only using W21,W42,W84.
Will adapt that in the following patches.



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-11-29 21:30
To: 钟居哲; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; Jeff Law
Subject: Re: [PATCH] RISC-V: Support highpart register overlap for vwcvt
>>> I can't really match spec and code.  For the lmul = 2 case sure,
>>> but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?
> Yes.
> 
> For vwcvt, LMUL 4 -> LMUL 8. 
> We allow overlap  vwcvt v0 (occupy v0 - v7), v4 (occupy v4 - v7)
> This patch support the overlap above.
 
Ok thanks, that way it makes sense.  The allowed overlap size is the
size of the source group which is determined by the "extension factor".
 
But don't we allow e.g. v2 and v4 with W82?  Shouldn't it be % 8 == 6
and % 8 == 7 for W82 and W81? Or for W41, % 4 == 3?  At least when looking
at the given spec example that would correspond to W82?
 
> This is kito's code. Could you suggest another name ? I can modify it.
 
overlap or group_overlap.  Then change "no" to "none" and rename
"vconstraint_enabled" to "group_overlap_valid" (or without the group).
 
Add a comment to group_overlap_valid:
 
; Widening instructions have group-overlap constraints.  Those are only
; valid for certain register-group sizes.  This attribute marks the
; alternatives not matching the required register-group size as disabled.
 
 
> I experiment with many tests, turns out adding ? generate better codegen.
> You can try it (remove ?) and testing it on case (I added in this patch).
 
It looks like we spill without it but I don't get why.  Well, as long
as it works, I guess we can defer that question.
 
Regards
Robin
  
juzhe.zhong@rivai.ai Nov. 29, 2023, 1:42 p.m. UTC | #7
>> overlap or group_overlap.  Then change "no" to "none" and rename
>> "vconstraint_enabled" to "group_overlap_valid" (or without the group).

>> Add a comment to group_overlap_valid:

>> ; Widening instructions have group-overlap constraints.  Those are only
>> ; valid for certain register-group sizes.  This attribute marks the
>> ; alternatives not matching the required register-group size as disabled.

Ok will send a patch to fix them. Thanks.


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-11-29 21:30
To: 钟居哲; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; Jeff Law
Subject: Re: [PATCH] RISC-V: Support highpart register overlap for vwcvt
>>> I can't really match spec and code.  For the lmul = 2 case sure,
>>> but W84 e.g. allows v4 and not v6?  What actually is "highest-numbered part"?
> Yes.
> 
> For vwcvt, LMUL 4 -> LMUL 8. 
> We allow overlap  vwcvt v0 (occupy v0 - v7), v4 (occupy v4 - v7)
> This patch support the overlap above.
 
Ok thanks, that way it makes sense.  The allowed overlap size is the
size of the source group which is determined by the "extension factor".
 
But don't we allow e.g. v2 and v4 with W82?  Shouldn't it be % 8 == 6
and % 8 == 7 for W82 and W81? Or for W41, % 4 == 3?  At least when looking
at the given spec example that would correspond to W82?
 
> This is kito's code. Could you suggest another name ? I can modify it.
 
overlap or group_overlap.  Then change "no" to "none" and rename
"vconstraint_enabled" to "group_overlap_valid" (or without the group).
 
Add a comment to group_overlap_valid:
 
; Widening instructions have group-overlap constraints.  Those are only
; valid for certain register-group sizes.  This attribute marks the
; alternatives not matching the required register-group size as disabled.
 
 
> I experiment with many tests, turns out adding ? generate better codegen.
> You can try it (remove ?) and testing it on case (I added in this patch).
 
It looks like we spill without it but I don't get why.  Well, as long
as it works, I guess we can defer that question.
 
Regards
Robin
  

Patch

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 68be4515c04..19bb36616bf 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -169,6 +169,29 @@ 
 (define_register_constraint "vm" "TARGET_VECTOR ? VM_REGS : NO_REGS"
   "A vector mask register (if available).")
 
+;; These following constraints are used by RVV instructions with dest EEW > src EEW.
+;; RISC-V 'V' Spec 5.2. Vector Operands:
+;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
+;; and the overlap is in the highest-numbered part of the destination register group.
+;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
+(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 2 == 1." "regno % 2 == 1")
+
+(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 2." "regno % 4 == 2")
+
+(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 4." "regno % 8 == 4")
+
+(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 1." "regno % 4 == 1")
+
+(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 1." "regno % 8 == 1")
+
+(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 2." "regno % 8 == 2")
+
 ;; This constraint is used to match instruction "csrr %0, vlenb" which is generated in "mov<mode>".
 ;; VLENB is a run-time constant which represent the vector register length in bytes.
 ;; BYTES_PER_RISCV_VECTOR represent runtime invariant of vector register length in bytes.
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 935eeb7fd8e..6bf2dfdf9b4 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -501,6 +501,27 @@ 
   ]
   (const_string "no")))
 
+(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
+  (const_string "no"))
+
+(define_attr "vconstraint_enabled" "no,yes"
+  (cond [(eq_attr "vconstraint" "no")
+         (const_string "yes")
+
+         (and (eq_attr "vconstraint" "W21")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
+	 (const_string "no")
+
+         (and (eq_attr "vconstraint" "W42,W41")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
+	 (const_string "no")
+
+         (and (eq_attr "vconstraint" "W84,W81,W82")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
+	 (const_string "no")
+        ]
+       (const_string "yes")))
+
 ;; Attribute to control enable or disable instructions.
 (define_attr "enabled" "no,yes"
   (cond [
@@ -509,6 +530,9 @@ 
 
     (eq_attr "fp_vector_disabled" "yes")
     (const_string "no")
+
+    (eq_attr "vconstraint_enabled" "no")
+    (const_string "no")
   ]
   (const_string "yes")))
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index ba9c9e5a9b6..bace900fee5 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3898,22 +3898,22 @@ 
 
 ;; vwcvt<u>.x.x.v
 (define_insn "@pred_<optab><mode>"
-  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
 	(if_then_else:VWEXTI
 	  (unspec:<VM>
-	    [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1")
-	     (match_operand 4 "vector_length_operand"              "   rK,   rK")
-	     (match_operand 5 "const_int_operand"                  "    i,    i")
-	     (match_operand 6 "const_int_operand"                  "    i,    i")
-	     (match_operand 7 "const_int_operand"                  "    i,    i")
+	    [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
+	     (match_operand 4 "vector_length_operand"              "   rK,   rK,   rK,   rK,   rK,   rK,   rK,   rK")
+	     (match_operand 5 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+	     (match_operand 6 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+	     (match_operand 7 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
 	     (reg:SI VL_REGNUM)
 	     (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
 	  (plus:VWEXTI
 	    (any_extend:VWEXTI
-	      (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,   vr"))
+	      (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "  W21,  W21,  W42,  W42,  W84,  W84,   vr,   vr"))
 	    (vec_duplicate:VWEXTI
 	      (reg:<VEL> X0_REGNUM)))
-	  (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
+	  (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0,   vu,    0,   vu,    0,   vu,    0")))]
   "TARGET_VECTOR"
   "vwcvt<u>.x.x.v\t%0,%3%p1"
   [(set_attr "type" "viwalu")
@@ -3921,7 +3921,8 @@ 
    (set_attr "vl_op_idx" "4")
    (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[5])"))
    (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
-   (set (attr "avl_type_idx") (const_int 7))])
+   (set (attr "avl_type_idx") (const_int 7))
+   (set_attr "vconstraint" "W21,W21,W42,W42,W84,W84,no,no")])
 
 ;; -------------------------------------------------------------------------------
 ;; ---- Predicated integer Narrowing operations
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
new file mode 100644
index 00000000000..6b9a7c448f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
@@ -0,0 +1,104 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+	  size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
+	  size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
+	  size_t sum15)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
+	 + sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v8 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v9 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v10 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v11 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v12 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v13 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v14 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v15 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      
+      asm volatile("nop" ::: "memory");
+      vint16m2_t vw0 = __riscv_vwcvt_x_x_v_i16m2 (v0, vl);
+      vint16m2_t vw1 = __riscv_vwcvt_x_x_v_i16m2 (v1, vl);
+      vint16m2_t vw2 = __riscv_vwcvt_x_x_v_i16m2 (v2, vl);
+      vint16m2_t vw3 = __riscv_vwcvt_x_x_v_i16m2 (v3, vl);
+      vint16m2_t vw4 = __riscv_vwcvt_x_x_v_i16m2 (v4, vl);
+      vint16m2_t vw5 = __riscv_vwcvt_x_x_v_i16m2 (v5, vl);
+      vint16m2_t vw6 = __riscv_vwcvt_x_x_v_i16m2 (v6, vl);
+      vint16m2_t vw7 = __riscv_vwcvt_x_x_v_i16m2 (v7, vl);
+      vint16m2_t vw8 = __riscv_vwcvt_x_x_v_i16m2 (v8, vl);
+      vint16m2_t vw9 = __riscv_vwcvt_x_x_v_i16m2 (v9, vl);
+      vint16m2_t vw10 = __riscv_vwcvt_x_x_v_i16m2 (v10, vl);
+      vint16m2_t vw11 = __riscv_vwcvt_x_x_v_i16m2 (v11, vl);
+      vint16m2_t vw12 = __riscv_vwcvt_x_x_v_i16m2 (v12, vl);
+      vint16m2_t vw13 = __riscv_vwcvt_x_x_v_i16m2 (v13, vl);
+      vint16m2_t vw14 = __riscv_vwcvt_x_x_v_i16m2 (v14, vl);
+      vint16m2_t vw15 = __riscv_vwcvt_x_x_v_i16m2 (v15, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m2_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m2_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m2_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m2_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m2_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m2_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m2_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m2_i16 (vw7);
+      size_t sum8 = __riscv_vmv_x_s_i16m2_i16 (vw8);
+      size_t sum9 = __riscv_vmv_x_s_i16m2_i16 (vw9);
+      size_t sum10 = __riscv_vmv_x_s_i16m2_i16 (vw10);
+      size_t sum11 = __riscv_vmv_x_s_i16m2_i16 (vw11);
+      size_t sum12 = __riscv_vmv_x_s_i16m2_i16 (vw12);
+      size_t sum13 = __riscv_vmv_x_s_i16m2_i16 (vw13);
+      size_t sum14 = __riscv_vmv_x_s_i16m2_i16 (vw14);
+      size_t sum15 = __riscv_vmv_x_s_i16m2_i16 (vw15);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8,
+		       sum9, sum10, sum11, sum12, sum13, sum14, sum15);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
new file mode 100644
index 00000000000..da92d59406f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
@@ -0,0 +1,68 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+	  size_t sum5, size_t sum6, size_t sum7)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v4 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v5 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v6 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v7 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m4_t vw0 = __riscv_vwcvt_x_x_v_i16m4 (v0, vl);
+      vint16m4_t vw1 = __riscv_vwcvt_x_x_v_i16m4 (v1, vl);
+      vint16m4_t vw2 = __riscv_vwcvt_x_x_v_i16m4 (v2, vl);
+      vint16m4_t vw3 = __riscv_vwcvt_x_x_v_i16m4 (v3, vl);
+      vint16m4_t vw4 = __riscv_vwcvt_x_x_v_i16m4 (v4, vl);
+      vint16m4_t vw5 = __riscv_vwcvt_x_x_v_i16m4 (v5, vl);
+      vint16m4_t vw6 = __riscv_vwcvt_x_x_v_i16m4 (v6, vl);
+      vint16m4_t vw7 = __riscv_vwcvt_x_x_v_i16m4 (v7, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m4_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m4_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m4_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m4_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m4_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m4_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m4_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m4_i16 (vw7);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
new file mode 100644
index 00000000000..46f93a9049b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
@@ -0,0 +1,51 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
+{
+  return sum0 + sum1 + sum2 + sum3;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
+      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
+      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
+      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
+
+      sum += sumation (sum0, sum1, sum2, sum3);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */