RISC-V: Avoid updating vl right before branching on avl
Checks
Context |
Check |
Description |
rivoscibot/toolchain-ci-rivos-lint |
warning
|
Lint failed
|
rivoscibot/toolchain-ci-rivos-apply-patch |
success
|
Patch applied
|
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_simplebootstrap_build--master-arm-bootstrap |
success
|
Build passed
|
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib |
success
|
Build passed
|
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Build passed
|
rivoscibot/toolchain-ci-rivos-test |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-arm |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_simplebootstrap_build--master-aarch64-bootstrap |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
See [1] thread for original patch which spawned this one.
We are currently seeing the following code where we perform a vsetvl
before a branching instruction against the avl.
vsetvli a5,a1,e32,m1,tu,ma
vle32.v v2,0(a0)
sub a1,a1,a5 <-- a1 potentially set to 0
sh2add a0,a5,a0
vfmacc.vv v1,v2,v2
vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
beq a1,zero,.L12 <-- check if avl is 0
Since we are branching off of the avl, we don't need to update vl until
after the branch is taken. Search the ready queue for vsetvls scheduled
before branching instructions that branch off of the same regno and
promote the branches to execute first. This can improve performance by
potentially avoiding setting VL=0, which may be expensive on some uarches.
[1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
PR target/117974
gcc/ChangeLog:
* config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
(insn_increases_zeroness_p): Ditto.
(riscv_promote_ready): Ditto.
(riscv_sched_reorder): Implement hook.
(TARGET_SCHED_REORDER): Define Hook.
* config/riscv/riscv.opt: New flag.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
Signed-off-by: Edwin Lu <ewlu@rivosinc.com>
Co-authored-by: Palmer Dabbelt <palmer@rivosinc.com>
---
gcc/config/riscv/riscv.cc | 103 ++++++++++++++++++
gcc/config/riscv/riscv.opt | 4 +
.../gcc.target/riscv/rvv/vsetvl/pr117974.c | 15 +++
3 files changed, 122 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
--
2.43.0
Comments
On 2/24/25 16:07, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
>
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
>
> vsetvli a5,a1,e32,m1,tu,ma
> vle32.v v2,0(a0)
> sub a1,a1,a5 <-- a1 potentially set to 0
> sh2add a0,a5,a0
> vfmacc.vv v1,v2,v2
> vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
> beq a1,zero,.L12 <-- check if avl is 0
>
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performancy by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>
> PR/117974
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
> (insn_increases_zeroness_p): Ditto.
> (riscv_promote_ready): Ditto.
> (riscv_sched_reorder): Implement hook.
> (TARGET_SCHED_REORDER): Define Hook.
> * config/riscv/riscv.opt: New flag.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
>
> Signed-off-by: Edwin Lu <ewlu@rivosinc.com>
> Co-authored-by: Palmer Dabbelt <palmer@rivosinc.com>
> ---
> gcc/config/riscv/riscv.cc | 103 ++++++++++++++++++
> gcc/config/riscv/riscv.opt | 4 +
> .../gcc.target/riscv/rvv/vsetvl/pr117974.c | 15 +++
> 3 files changed, 122 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 89aa25d5da9..cf0866fa3fb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -14035,6 +14035,106 @@ bool need_shadow_stack_push_pop_p ()
> return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
> }
>
> +static int
> +vsetvl_avl_regno(rtx_insn *insn)
> +{
> + if (recog_memoized (insn) < 0)
> + return -1;
> +
> + if (get_attr_type (insn) != TYPE_VSETVL
> + && get_attr_type (insn) != TYPE_VSETVL_PRE)
> + return -1;
> +
> + extract_insn (insn);
> + /* From vector.md, vsetvl operands are as follows:
> + ;; operands[0]: VL.
> + ;; operands[1]: AVL.
> + ;; operands[2]: SEW
> + ;; operands[3]: LMUL
> + ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
> + ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
> + Return regno of avl operand. */
> + return REGNO (recog_data.operand[1]);
> +}
> +
> +static bool
> +insn_increases_zeroness_p(rtx_insn *insn, int regno)
Minor point, but this is just one case of introducing VL=0 zeroness specific to
a branch, not the general case.
> +{
> + /* Check for branching against zero. */
> + if (JUMP_P (insn))
> + {
> + extract_insn (insn);
> + bool match_reg = false;
> + bool comp_zero = false;
> + for (int i = 0; i < recog_data.n_operands; i++)
> + {
> + if (REG_P (recog_data.operand[i])
> + && REGNO (recog_data.operand[i]) == regno)
> + match_reg = true;
> + if (CONST_INT_P (recog_data.operand[i])
> + && XINT (recog_data.operand[i], 0) == 0
> + && XWINT (recog_data.operand[i], 0) == 0)
> + comp_zero = true;
> + }
> + return match_reg && comp_zero;
> + }
> + return false;
> +}
> +
> +/* Copied from MIPS. Removes the instruction at index LOWER from ready
> + queue READY and reinserts it in from of the instruction at index
> + HIGHER. LOWER must be <= HIGHER. */
> +static void
> +riscv_promote_ready (rtx_insn **ready, int lower, int higher)
> +{
> + rtx_insn *new_head;
> + int i;
> +
> + new_head = ready[lower];
> + for (i = lower; i < higher; i++)
> + ready[i] = ready[i + 1];
> + ready[i] = new_head;
> +}
> +
> +/* Attempt to avoid issuing VSETVL-type instructions before a branch that
> + ensures they are non-zero, as setting VL=0 dynamically can be slow. */
> +static int
> +riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
> + rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
> +{
> + if (! TARGET_AVOID_VL_EQ_0)
> + return riscv_issue_rate ();
> +
> + for (int i = *nreadyp - 1; i >= 0; i--)
> + {
> + /* Find the vsetvl. */
> + int avl_regno = vsetvl_avl_regno (ready[i]);
> + if (avl_regno == -1 || i == 0)
> + continue;
> + for (int j = i - 1; j >= 0; j--)
> + {
> + /* Exit if another vsetvl is found before finding a branch insn
> + in the ready queue. */
> + if (recog_memoized (ready[j]) >= 0
> + && get_attr_type (ready[j]) == TYPE_VSETVL
> + && get_attr_type (ready[j]) == TYPE_VSETVL_PRE)
> + break;
> + /* Find branch. */
> + if (recog_memoized (ready[j]) >= 0
> + && insn_increases_zeroness_p (ready[j], avl_regno))
> + {
> + /* Right now the only zeroness-increasing pattern we recognize
> + is a branch-not-zero, so there's no sense in looking for any
> + more zeroness at that point. */
> + riscv_promote_ready (ready, j, i);
> + break;
> + }
> + }
> + }
> +
> + return riscv_issue_rate ();
> +}
> +
> /* Initialize the GCC target structure. */
> #undef TARGET_ASM_ALIGNED_HI_OP
> #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -14430,6 +14530,9 @@ bool need_shadow_stack_push_pop_p ()
> #undef TARGET_DOCUMENTATION_NAME
> #define TARGET_DOCUMENTATION_NAME "RISC-V"
>
> +#undef TARGET_SCHED_REORDER
> +#define TARGET_SCHED_REORDER riscv_sched_reorder
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-riscv.h"
> diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> index 7515c8ea13d..c6cab61fdc0 100644
> --- a/gcc/config/riscv/riscv.opt
> +++ b/gcc/config/riscv/riscv.opt
> @@ -681,3 +681,7 @@ Specifies whether the fence.tso instruction should be used.
> mautovec-segment
> Target Integer Var(riscv_mautovec_segment) Init(1)
> Enable (default) or disable generation of vector segment load/store instructions.
> +
> +mavoid-vl0
> +Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
> +Avoid (default) code that dynamically sets VL=0 where possible.
As stated above, this is not the general case of VL=0 avoidance but intersection
of VL=0 and a branch, so better to phrase it that way,
Also as others have said in earlier threads, this would be better as a cpu tune,
specifically part of vector tuning, maybe default for OoO tune.
Otherwise looks pretty cool !
-Vineet
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> new file mode 100644
> index 00000000000..275922eb0bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -Ofast" } */
> +
> +float g(float q[], int N){
> + float dqnorm = 0.0;
> +
> + #pragma GCC unroll 4
> +
> + for (int i=0; i < N; i++) {
> + dqnorm = dqnorm + q[i] * q[i];
> + }
> + return dqnorm;
> +}
> +
> +/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3 } } */
> --
> 2.43.0
>
On 2/24/25 5:07 PM, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
>
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
>
> vsetvli a5,a1,e32,m1,tu,ma
> vle32.v v2,0(a0)
> sub a1,a1,a5 <-- a1 potentially set to 0
> sh2add a0,a5,a0
> vfmacc.vv v1,v2,v2
> vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
> beq a1,zero,.L12 <-- check if avl is 0
>
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performancy by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>
> PR/117974
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
> (insn_increases_zeroness_p): Ditto.
> (riscv_promote_ready): Ditto.
> (riscv_sched_reorder): Implement hook.
> (TARGET_SCHED_REORDER): Define Hook.
> * config/riscv/riscv.opt: New flag.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
So I preferred the earlier approach of disabling speculation of the
vsetvls, though I'm guessing you're looking at this approach because
that was insufficient?
Regardless, like the disabling of speculation I tend to think this
should be controlled by an entry in the tuning structure rather than
flags. Flags are fine for internal testing, but I don't see a
compelling need for a flag to control this in general.
Jeff
On 2/24/2025 4:34 PM, Jeff Law wrote:
>
>
> On 2/24/25 5:07 PM, Edwin Lu wrote:
>> See [1] thread for original patch which spawned this one.
>>
>> We are currently seeing the following code where we perform a vsetvl
>> before a branching instruction against the avl.
>>
>> vsetvli a5,a1,e32,m1,tu,ma
>> vle32.v v2,0(a0)
>> sub a1,a1,a5 <-- a1 potentially set to 0
>> sh2add a0,a5,a0
>> vfmacc.vv v1,v2,v2
>> vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl
>> to 0
>> beq a1,zero,.L12 <-- check if avl is 0
>>
>> Since we are branching off of the avl, we don't need to update vl until
>> after the branch is taken. Search the ready queue for vsetvls scheduled
>> before branching instructions that branch off of the same regno and
>> promote the branches to execute first. This can improve performancy by
>> potentially avoiding setting VL=0 which may be expensive on some
>> uarches.
>>
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>>
>> PR/117974
>>
>> gcc/ChangeLog:
>>
>> * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
>> (insn_increases_zeroness_p): Ditto.
>> (riscv_promote_ready): Ditto.
>> (riscv_sched_reorder): Implement hook.
>> (TARGET_SCHED_REORDER): Define Hook.
>> * config/riscv/riscv.opt: New flag.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
> So I preferred the earlier approach of disabling speculation of the
> vsetvls, though I'm guessing you're looking at this approach because
> that was insufficient?
I don't think that it was because it was insufficient, but that it might
be too constraining. I'm not exactly certain if there is the possibility
where speculatively issuing some vsetvls in some situations could
improve perf. I think with this patch, it targets the specific issue
we're facing directly which is updating vl=0 when it's unnecessary. I
don't know which patch would be better so I'm putting this out there as
a potential alternative.
> Regardless, like the disabling of speculation I tend to think this
> should be controlled by an entry in the tuning structure rather than
> flags. Flags are fine for internal testing, but I don't see a
> compelling need for a flag to control this in general.
>
> Jeff
That makes sense to me. I completely forgot that we had talked about
putting it into the tuning structure when writing this. I'll update the
tunes respectively in a later version of the patches depending on which
one moves forward.
Edwin
On 2/24/25 6:10 PM, Edwin Lu wrote:
>> So I preferred the earlier approach of disabling speculation of the
>> vsetvls, though I'm guessing you're looking at this approach because
>> that was insufficient?
> I don't think that it was because it was insufficient, but that it might
> be too constraining. I'm not exactly certain if there is the possibility
> where speculatively issuing some vsetvls in some situations could
> improve perf.
On an out of order core I would expect the hardware to do the
speculation and doing it in the compiler is of marginal value, if any.
I think with this patch, it targets the specific issue
> we're facing directly which is updating vl=0 when it's unnecessary. I
> don't know which patch would be better so I'm putting this out there as
> a potential alternative.
I suspect (but haven't verified) that the only time you'll see the
conditional branch and the vsetvl in the ready queue together is in the
speculative case. Now it may be the case that you can be more targeted
in inhibiting speculation, but again, for an out of order core I don't
think it's actually going to matter.
Essentially when we aren't speculating the conditional branch will be
dependent on all the instructions in the block. So the conditional
branch won't go into the ready queue until after everything else has issued.
Given we're not planning to move forward until stage1 reopens, you've
got time to play with it internally. I've also got a mental todo to
check with our designers to see if we have any sensitivity to this issue.
jeff
@@ -14035,6 +14035,106 @@ bool need_shadow_stack_push_pop_p ()
return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
}
+/* If INSN is a vsetvl/vsetvl_pre instruction whose AVL is a register,
+   return the hard register number of that AVL operand; otherwise
+   return -1.  */
+static int
+vsetvl_avl_regno (rtx_insn *insn)
+{
+  if (recog_memoized (insn) < 0)
+    return -1;
+
+  if (get_attr_type (insn) != TYPE_VSETVL
+      && get_attr_type (insn) != TYPE_VSETVL_PRE)
+    return -1;
+
+  extract_insn (insn);
+  /* From vector.md, vsetvl operands are as follows:
+     ;; operands[0]: VL.
+     ;; operands[1]: AVL.
+     ;; operands[2]: SEW
+     ;; operands[3]: LMUL
+     ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
+     ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
+     Return regno of avl operand.  The AVL may also be an immediate
+     (vsetivli-style); REGNO is only valid on a REG, so bail out in
+     that case.  */
+  if (!REG_P (recog_data.operand[1]))
+    return -1;
+  return REGNO (recog_data.operand[1]);
+}
+
+/* Return true if INSN is a conditional branch that compares register
+   REGNO against the constant zero.  Such a branch "increases zeroness":
+   after it, one path knows REGNO is zero and the other knows it is
+   non-zero, so a vsetvl whose AVL is REGNO need not execute first.  */
+static bool
+insn_increases_zeroness_p (rtx_insn *insn, int regno)
+{
+  /* Check for branching against zero.  */
+  if (JUMP_P (insn))
+    {
+      extract_insn (insn);
+      bool match_reg = false;
+      bool comp_zero = false;
+      for (int i = 0; i < recog_data.n_operands; i++)
+	{
+	  if (REG_P (recog_data.operand[i])
+	      && (int) REGNO (recog_data.operand[i]) == regno)
+	    match_reg = true;
+	  /* INTVAL is the defined accessor for a CONST_INT's value;
+	     XINT is not valid on a CONST_INT.  */
+	  if (CONST_INT_P (recog_data.operand[i])
+	      && INTVAL (recog_data.operand[i]) == 0)
+	    comp_zero = true;
+	}
+      return match_reg && comp_zero;
+    }
+  return false;
+}
+
+/* Copied from MIPS.  Removes the instruction at index LOWER from ready
+   queue READY and reinserts it in front of the instruction at index
+   HIGHER.  LOWER must be <= HIGHER.  Note the queue issues from the
+   high end, so the insn previously at HIGHER now issues earlier.  */
+static void
+riscv_promote_ready (rtx_insn **ready, int lower, int higher)
+{
+  rtx_insn *new_head;
+  int i;
+
+  new_head = ready[lower];
+  for (i = lower; i < higher; i++)
+    ready[i] = ready[i + 1];
+  ready[i] = new_head;
+}
+
+/* Implement TARGET_SCHED_REORDER.  Attempt to avoid issuing VSETVL-type
+   instructions before a branch that ensures their AVL is non-zero, as
+   setting VL=0 dynamically can be slow.  READY holds *NREADYP ready
+   insns, issued from the high end.  Returns the issue rate.  */
+static int
+riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+		     rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
+{
+  if (! TARGET_AVOID_VL_EQ_0)
+    return riscv_issue_rate ();
+
+  /* Start at i > 0: a vsetvl at index 0 has nothing after it to promote.  */
+  for (int i = *nreadyp - 1; i > 0; i--)
+    {
+      /* Find the vsetvl.  */
+      int avl_regno = vsetvl_avl_regno (ready[i]);
+      if (avl_regno == -1)
+	continue;
+      for (int j = i - 1; j >= 0; j--)
+	{
+	  if (recog_memoized (ready[j]) < 0)
+	    continue;
+	  /* Exit if another vsetvl is found before finding a branch insn
+	     in the ready queue.  This must be ||, not && — a single insn
+	     cannot have both types, so && made this guard unreachable.  */
+	  if (get_attr_type (ready[j]) == TYPE_VSETVL
+	      || get_attr_type (ready[j]) == TYPE_VSETVL_PRE)
+	    break;
+	  /* Find branch.  */
+	  if (insn_increases_zeroness_p (ready[j], avl_regno))
+	    {
+	      /* Right now the only zeroness-increasing pattern we recognize
+		 is a branch-not-zero, so there's no sense in looking for any
+		 more zeroness at that point.  */
+	      riscv_promote_ready (ready, j, i);
+	      break;
+	    }
+	}
+    }
+
+  return riscv_issue_rate ();
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -14430,6 +14530,9 @@ bool need_shadow_stack_push_pop_p ()
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "RISC-V"
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER riscv_sched_reorder
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
@@ -681,3 +681,7 @@ Specifies whether the fence.tso instruction should be used.
mautovec-segment
Target Integer Var(riscv_mautovec_segment) Init(1)
Enable (default) or disable generation of vector segment load/store instructions.
+
+mavoid-vl0
+Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
+Avoid (default) code that dynamically sets VL=0 where possible.
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -Ofast" } */
+
+float g(float q[], int N){
+ float dqnorm = 0.0;
+
+ #pragma GCC unroll 4
+
+ for (int i=0; i < N; i++) {
+ dqnorm = dqnorm + q[i] * q[i];
+ }
+ return dqnorm;
+}
+
+/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3 } } */