RISC-V: Avoid updating vl right before branching on avl

Message ID 20250225000956.2729050-1-ewlu@rivosinc.com
State Deferred
Headers
Series RISC-V: Avoid updating vl right before branching on avl |

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint warning Lint failed
rivoscibot/toolchain-ci-rivos-apply-patch success Patch applied
rivoscibot/toolchain-ci-rivos-build--newlib-rv64gcv-lp64d-multilib success Build passed
linaro-tcwg-bot/tcwg_simplebootstrap_build--master-arm-bootstrap success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gcv-lp64d-multilib success Build passed
rivoscibot/toolchain-ci-rivos-build--linux-rv64gc_zba_zbb_zbc_zbs-lp64d-multilib success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
rivoscibot/toolchain-ci-rivos-test success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_simplebootstrap_build--master-aarch64-bootstrap success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed

Commit Message

Edwin Lu Feb. 25, 2025, 12:07 a.m. UTC
  See [1] thread for original patch which spawned this one.

We are currently seeing the following code where we perform a vsetvl
before a branching instruction against the avl.

        vsetvli a5,a1,e32,m1,tu,ma
        vle32.v v2,0(a0)
        sub     a1,a1,a5 <-- a1 potentially set to 0
        sh2add  a0,a5,a0
        vfmacc.vv       v1,v2,v2
        vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
        beq     a1,zero,.L12 <-- check if avl is 0

Since we are branching off of the avl, we don't need to update vl until
after the branch is taken. Search the ready queue for vsetvls scheduled
before branching instructions that branch off of the same regno and
promote the branches to execute first. This can improve performance by
potentially avoiding setting VL=0 which may be expensive on some uarches.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html

	PR/117974

gcc/ChangeLog:

	* config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
	(insn_increases_zeroness_p): Ditto.
	(riscv_promote_ready): Ditto.
	(riscv_sched_reorder): Implement hook.
	(TARGET_SCHED_REORDER): Define Hook.
	* config/riscv/riscv.opt: New flag.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.

Signed-off-by: Edwin Lu <ewlu@rivosinc.com>
Co-authored-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 gcc/config/riscv/riscv.cc                     | 103 ++++++++++++++++++
 gcc/config/riscv/riscv.opt                    |   4 +
 .../gcc.target/riscv/rvv/vsetvl/pr117974.c    |  15 +++
 3 files changed, 122 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c

--
2.43.0
  

Comments

Vineet Gupta Feb. 25, 2025, 12:33 a.m. UTC | #1
On 2/24/25 16:07, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
>
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
>
>         vsetvli a5,a1,e32,m1,tu,ma
>         vle32.v v2,0(a0)
>         sub     a1,a1,a5 <-- a1 potentially set to 0
>         sh2add  a0,a5,a0
>         vfmacc.vv       v1,v2,v2
>         vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
>         beq     a1,zero,.L12 <-- check if avl is 0
>
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performancy by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>
> 	PR/117974
>
> gcc/ChangeLog:
>
> 	* config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
> 	(insn_increases_zeroness_p): Ditto.
> 	(riscv_promote_ready): Ditto.
> 	(riscv_sched_reorder): Implement hook.
> 	(TARGET_SCHED_REORDER): Define Hook.
> 	* config/riscv/riscv.opt: New flag.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
>
> Signed-off-by: Edwin Lu <ewlu@rivosinc.com>
> Co-authored-by: Palmer Dabbelt <palmer@rivosinc.com>
> ---
>  gcc/config/riscv/riscv.cc                     | 103 ++++++++++++++++++
>  gcc/config/riscv/riscv.opt                    |   4 +
>  .../gcc.target/riscv/rvv/vsetvl/pr117974.c    |  15 +++
>  3 files changed, 122 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 89aa25d5da9..cf0866fa3fb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -14035,6 +14035,106 @@ bool need_shadow_stack_push_pop_p ()
>    return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
>  }
>
> +static int
> +vsetvl_avl_regno(rtx_insn *insn)
> +{
> +  if (recog_memoized (insn) < 0)
> +    return -1;
> +
> +  if (get_attr_type (insn) != TYPE_VSETVL
> +      && get_attr_type (insn) != TYPE_VSETVL_PRE)
> +    return -1;
> +
> +  extract_insn (insn);
> +  /* From vector.md, vsetvl operands are as follows:
> +      ;; operands[0]: VL.
> +      ;; operands[1]: AVL.
> +      ;; operands[2]: SEW
> +      ;; operands[3]: LMUL
> +      ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
> +      ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
> +     Return regno of avl operand.  */
> +  return REGNO (recog_data.operand[1]);
> +}
> +
> +static bool
> +insn_increases_zeroness_p(rtx_insn *insn, int regno)

Minor point, but this is just one case of introducing VL=0 zeroness specific to
branch, not the general case.

> +{
> +  /* Check for branching against zero.  */
> +  if (JUMP_P (insn))
> +    {
> +      extract_insn (insn);
> +      bool match_reg = false;
> +      bool comp_zero = false;
> +      for (int i = 0; i < recog_data.n_operands; i++)
> +	{
> +	  if (REG_P (recog_data.operand[i])
> +	      && REGNO (recog_data.operand[i]) == regno)
> +	    match_reg = true;
> +	  if (CONST_INT_P (recog_data.operand[i])
> +	      && XINT (recog_data.operand[i], 0) == 0
> +	      && XWINT (recog_data.operand[i], 0) == 0)
> +	    comp_zero = true;
> +	}
> +      return match_reg && comp_zero;
> +    }
> +  return false;
> +}
> +
> +/* Copied from MIPS.  Removes the instruction at index LOWER from ready
> +   queue READY and reinserts it in from of the instruction at index
> +   HIGHER.  LOWER must be <= HIGHER.  */
> +static void
> +riscv_promote_ready (rtx_insn **ready, int lower, int higher)
> +{
> +  rtx_insn *new_head;
> +  int i;
> +
> +  new_head = ready[lower];
> +  for (i = lower; i < higher; i++)
> +    ready[i] = ready[i + 1];
> +  ready[i] = new_head;
> +}
> +
> +/* Attempt to avoid issuing VSETVL-type instructions before a branch that
> +   ensures they are non-zero, as setting VL=0 dynamically can be slow.  */
> +static int
> +riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
> +		     rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
> +{
> +    if (! TARGET_AVOID_VL_EQ_0)
> +      return riscv_issue_rate ();
> +
> +    for (int i = *nreadyp - 1; i >= 0; i--)
> +      {
> +	/* Find the vsetvl.  */
> +	int avl_regno = vsetvl_avl_regno (ready[i]);
> +	if (avl_regno == -1 || i == 0)
> +	  continue;
> +	for (int j = i - 1; j >= 0; j--)
> +	  {
> +	    /* Exit if another vsetvl is found before finding a branch insn
> +	       in the ready queue.  */
> +	    if (recog_memoized (ready[j]) >= 0
> +		&& get_attr_type (ready[j]) == TYPE_VSETVL
> +		&& get_attr_type (ready[j]) == TYPE_VSETVL_PRE)
> +	      break;
> +	    /* Find branch.  */
> +	    if (recog_memoized (ready[j]) >= 0
> +		&& insn_increases_zeroness_p (ready[j], avl_regno))
> +	      {
> +		/* Right now the only zeroness-increasing pattern we recognize
> +		   is a branch-not-zero, so there's no sense in looking for any
> +		   more zeroness at that point.  */
> +		riscv_promote_ready (ready, j, i);
> +		break;
> +	      }
> +	  }
> +      }
> +
> +    return riscv_issue_rate ();
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -14430,6 +14530,9 @@ bool need_shadow_stack_push_pop_p ()
>  #undef TARGET_DOCUMENTATION_NAME
>  #define TARGET_DOCUMENTATION_NAME "RISC-V"
>
> +#undef TARGET_SCHED_REORDER
> +#define TARGET_SCHED_REORDER riscv_sched_reorder
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-riscv.h"
> diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> index 7515c8ea13d..c6cab61fdc0 100644
> --- a/gcc/config/riscv/riscv.opt
> +++ b/gcc/config/riscv/riscv.opt
> @@ -681,3 +681,7 @@ Specifies whether the fence.tso instruction should be used.
>  mautovec-segment
>  Target Integer Var(riscv_mautovec_segment) Init(1)
>  Enable (default) or disable generation of vector segment load/store instructions.
> +
> +mavoid-vl0
> +Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
> +Avoid (default) code that dynamically sets VL=0 where possible.

As stated above, this is not the general case of VL=0 avoidance but intersection
of VL=0 and a branch, so better to phrase it that way,

Also as others have said in earlier threads, this would be better as a cpu tune,
specifically part of vector tuning, maybe default for OoO tune.

Otherwise looks pretty cool  !

-Vineet

> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> new file mode 100644
> index 00000000000..275922eb0bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -Ofast" } */
> +
> +float g(float q[], int N){
> +    float dqnorm = 0.0;
> +
> +    #pragma GCC unroll 4
> +
> +    for (int i=0; i < N; i++) {
> +        dqnorm = dqnorm + q[i] * q[i];
> +    }
> +    return dqnorm;
> +}
> +
> +/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3 } } */
> --
> 2.43.0
>
  
Jeff Law Feb. 25, 2025, 12:34 a.m. UTC | #2
On 2/24/25 5:07 PM, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
> 
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
> 
>          vsetvli a5,a1,e32,m1,tu,ma
>          vle32.v v2,0(a0)
>          sub     a1,a1,a5 <-- a1 potentially set to 0
>          sh2add  a0,a5,a0
>          vfmacc.vv       v1,v2,v2
>          vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
>          beq     a1,zero,.L12 <-- check if avl is 0
> 
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performancy by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
> 
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
> 
> 	PR/117974
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
> 	(insn_increases_zeroness_p): Ditto.
> 	(riscv_promote_ready): Ditto.
> 	(riscv_sched_reorder): Implement hook.
> 	(TARGET_SCHED_REORDER): Define Hook.
> 	* config/riscv/riscv.opt: New flag.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
So I preferred the earlier approach of disabling speculation of the 
vsetvls, though I'm guessing you're looking at this approach because 
that was insufficient?

Regardless, like the disabling of speculation I tend to think this 
should be controlled by an entry in the tuning structure rather than 
flags.  Flags are fine for internal testing, but I don't see a 
compelling need for a flag to control this in general.

Jeff
  
Edwin Lu Feb. 25, 2025, 1:10 a.m. UTC | #3
On 2/24/2025 4:34 PM, Jeff Law wrote:
>
>
> On 2/24/25 5:07 PM, Edwin Lu wrote:
>> See [1] thread for original patch which spawned this one.
>>
>> We are currently seeing the following code where we perform a vsetvl
>> before a branching instruction against the avl.
>>
>>          vsetvli a5,a1,e32,m1,tu,ma
>>          vle32.v v2,0(a0)
>>          sub     a1,a1,a5 <-- a1 potentially set to 0
>>          sh2add  a0,a5,a0
>>          vfmacc.vv       v1,v2,v2
>>          vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl 
>> to 0
>>          beq     a1,zero,.L12 <-- check if avl is 0
>>
>> Since we are branching off of the avl, we don't need to update vl until
>> after the branch is taken. Search the ready queue for vsetvls scheduled
>> before branching instructions that branch off of the same regno and
>> promote the branches to execute first. This can improve performancy by
>> potentially avoiding setting VL=0 which may be expensive on some 
>> uarches.
>>
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>>
>>     PR/117974
>>
>> gcc/ChangeLog:
>>
>>     * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
>>     (insn_increases_zeroness_p): Ditto.
>>     (riscv_promote_ready): Ditto.
>>     (riscv_sched_reorder): Implement hook.
>>     (TARGET_SCHED_REORDER): Define Hook.
>>     * config/riscv/riscv.opt: New flag.
>>
>> gcc/testsuite/ChangeLog:
>>
>>     * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
> So I preferred the earlier approach of disabling speculation of the 
> vsetvls, though I'm guessing you're looking at this approach because 
> that was insufficient?
I don't think that it was because it was insufficient, but that it might 
be too constraining. I'm not exactly certain if there is the possibility 
where speculatively issuing some vsetvls in some situations could 
improve perf. I think with this patch, it targets the specific issue 
we're facing directly which is updating vl=0 when it's unnecessary. I 
don't know which patch would be better so I'm putting this out there as 
a potential alternative.
> Regardless, like the disabling of speculation I tend to think this 
> should be controlled by an entry in the tuning structure rather than 
> flags. Flags are fine for internal testing, but I don't see a 
> compelling need for a flag to control this in general.
>
> Jeff

That makes sense to me. I completely forgot that we had talked about 
putting it into the tuning structure when writing this. I'll update the 
tunes respectively in a later version of the patches depending on which 
one moves forward.

Edwin
  
Jeff Law Feb. 25, 2025, 2:39 a.m. UTC | #4
On 2/24/25 6:10 PM, Edwin Lu wrote:

>> So I preferred the earlier approach of disabling speculation of the 
>> vsetvls, though I'm guessing you're looking at this approach because 
>> that was insufficient?
> I don't think that it was because it was insufficient, but that it might 
> be too constraining. I'm not exactly certain if there is the possibility 
> where speculatively issuing some vsetvls in some situations could 
> improve perf.
On an out of order core I would expect the hardware to do the 
speculation and doing it in the compiler is of marginal value, if any.



  I think with this patch, it targets the specific issue
> we're facing directly which is updating vl=0 when it's unnecessary. I 
> don't know which patch would be better so I'm putting this out there as 
> a potential alternative.
I suspect (but haven't verified) that the only time you'll see the 
conditional branch and the vsetvl in the ready queue together is in the 
speculative case.  Now it may be the case that you can be more targeted 
in inhibiting speculation, but again, for an out of order core I don't 
think it's actually going to matter.

Essentially when we aren't speculating the conditional branch will be 
dependent on all the instructions in the block.  So the conditional 
branch won't go into the ready queue until after everything else has issued.

Given we're not planning to move forward until stage1 reopens, you've 
got time to play with it internally. I've also got a mental todo to 
check with our designers to see if we have any sensitivity to this issue.

jeff
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 89aa25d5da9..cf0866fa3fb 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -14035,6 +14035,106 @@  bool need_shadow_stack_push_pop_p ()
   return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
 }

+static int
+vsetvl_avl_regno(rtx_insn *insn)
+{
+  if (recog_memoized (insn) < 0)
+    return -1;
+
+  if (get_attr_type (insn) != TYPE_VSETVL
+      && get_attr_type (insn) != TYPE_VSETVL_PRE)
+    return -1;
+
+  extract_insn (insn);
+  /* From vector.md, vsetvl operands are as follows:
+      ;; operands[0]: VL.
+      ;; operands[1]: AVL.
+      ;; operands[2]: SEW
+      ;; operands[3]: LMUL
+      ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
+      ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
+     Return regno of avl operand.  */
+  return REGNO (recog_data.operand[1]);
+}
+
+static bool
+insn_increases_zeroness_p(rtx_insn *insn, int regno)
+{
+  /* Check for branching against zero.  */
+  if (JUMP_P (insn))
+    {
+      extract_insn (insn);
+      bool match_reg = false;
+      bool comp_zero = false;
+      for (int i = 0; i < recog_data.n_operands; i++)
+	{
+	  if (REG_P (recog_data.operand[i])
+	      && REGNO (recog_data.operand[i]) == regno)
+	    match_reg = true;
+	  if (CONST_INT_P (recog_data.operand[i])
+	      && XINT (recog_data.operand[i], 0) == 0
+	      && XWINT (recog_data.operand[i], 0) == 0)
+	    comp_zero = true;
+	}
+      return match_reg && comp_zero;
+    }
+  return false;
+}
+
+/* Copied from MIPS.  Removes the instruction at index LOWER from ready
+   queue READY and reinserts it in from of the instruction at index
+   HIGHER.  LOWER must be <= HIGHER.  */
+static void
+riscv_promote_ready (rtx_insn **ready, int lower, int higher)
+{
+  rtx_insn *new_head;
+  int i;
+
+  new_head = ready[lower];
+  for (i = lower; i < higher; i++)
+    ready[i] = ready[i + 1];
+  ready[i] = new_head;
+}
+
+/* Attempt to avoid issuing VSETVL-type instructions before a branch that
+   ensures they are non-zero, as setting VL=0 dynamically can be slow.  */
+static int
+riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+		     rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
+{
+    if (! TARGET_AVOID_VL_EQ_0)
+      return riscv_issue_rate ();
+
+    for (int i = *nreadyp - 1; i >= 0; i--)
+      {
+	/* Find the vsetvl.  */
+	int avl_regno = vsetvl_avl_regno (ready[i]);
+	if (avl_regno == -1 || i == 0)
+	  continue;
+	for (int j = i - 1; j >= 0; j--)
+	  {
+	    /* Exit if another vsetvl is found before finding a branch insn
+	       in the ready queue.  */
+	    if (recog_memoized (ready[j]) >= 0
+		&& get_attr_type (ready[j]) == TYPE_VSETVL
+		&& get_attr_type (ready[j]) == TYPE_VSETVL_PRE)
+	      break;
+	    /* Find branch.  */
+	    if (recog_memoized (ready[j]) >= 0
+		&& insn_increases_zeroness_p (ready[j], avl_regno))
+	      {
+		/* Right now the only zeroness-increasing pattern we recognize
+		   is a branch-not-zero, so there's no sense in looking for any
+		   more zeroness at that point.  */
+		riscv_promote_ready (ready, j, i);
+		break;
+	      }
+	  }
+      }
+
+    return riscv_issue_rate ();
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -14430,6 +14530,9 @@  bool need_shadow_stack_push_pop_p ()
 #undef TARGET_DOCUMENTATION_NAME
 #define TARGET_DOCUMENTATION_NAME "RISC-V"

+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER riscv_sched_reorder
+
 struct gcc_target targetm = TARGET_INITIALIZER;

 #include "gt-riscv.h"
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 7515c8ea13d..c6cab61fdc0 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -681,3 +681,7 @@  Specifies whether the fence.tso instruction should be used.
 mautovec-segment
 Target Integer Var(riscv_mautovec_segment) Init(1)
 Enable (default) or disable generation of vector segment load/store instructions.
+
+mavoid-vl0
+Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
+Avoid (default) code that dynamically sets VL=0 where possible.
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
new file mode 100644
index 00000000000..275922eb0bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -Ofast" } */
+
+float g(float q[], int N){
+    float dqnorm = 0.0;
+
+    #pragma GCC unroll 4
+
+    for (int i=0; i < N; i++) {
+        dqnorm = dqnorm + q[i] * q[i];
+    }
+    return dqnorm;
+}
+
+/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3 } } */