RISC-V: Fix PR108279

Message ID 20230327065907.155807-1-juzhe.zhong@rivai.ai
State Committed
Headers
Series RISC-V: Fix PR108279 |

Commit Message

juzhe.zhong@rivai.ai March 27, 2023, 6:59 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

        PR 108270

Fix bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.

Consider the following testcase:
void f (void * restrict in, void * restrict out, int l, int n, int m)
{
  for (int i = 0; i < l; i++){
    for (int j = 0; j < m; j++){
      for (int k = 0; k < n; k++)
        {
          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
          __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
        }
    }
  }
}

Compile option: -O3

Before this patch:
	mv	a7,a2
	mv	a6,a0	
        mv	t1,a1
	mv	a2,a3
	vsetivli	zero,17,e8,mf8,ta,ma
...

After this patch:
        mv      a7,a2
        mv      a6,a0
        mv      t1,a1
        mv      a2,a3
        ble     a7,zero,.L1
        ble     a4,zero,.L1
        ble     a3,zero,.L1
        add     a1,a0,a4
        li      a0,0
        vsetivli        zero,17,e8,mf8,ta,ma
...

It will produce potential bug when:

int main ()
{
  vsetivli zero, 100,.....
  f (in, out, 0,0,0)
  asm volatile ("csrr a0,vl":::"memory");

  // Before this patch the a0 is 17. (Wrong).
  // After this patch the a0 is 100. (Correct).
  ...
}

gcc/ChangeLog:

        * config/riscv/riscv-vsetvl.cc (vector_infos_manager::all_empty_predecessor_p): New function.
        (pass_vsetvl::backward_demand_fusion): Fix bug.
        * config/riscv/riscv-vsetvl.h: New function declare.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c: Adapt test.
        * gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c: Adapt test.
        * gcc.target/riscv/rvv/vsetvl/pr108270.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc              | 24 +++++++++++++++++++
 gcc/config/riscv/riscv-vsetvl.h               |  2 ++
 .../riscv/rvv/vsetvl/imm_bb_prop-1.c          |  2 +-
 .../riscv/rvv/vsetvl/imm_conflict-3.c         |  4 ++--
 .../gcc.target/riscv/rvv/vsetvl/pr108270.c    | 19 +++++++++++++++
 5 files changed, 48 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
  

Comments

Jeff Law April 2, 2023, 7:41 p.m. UTC | #1
On 3/27/23 00:59, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
>          PR 108270
> 
> Fix bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
> 
> Consider the following testcase:
> void f (void * restrict in, void * restrict out, int l, int n, int m)
> {
>    for (int i = 0; i < l; i++){
>      for (int j = 0; j < m; j++){
>        for (int k = 0; k < n; k++)
>          {
>            vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
>            __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
>          }
>      }
>    }
> }
> 
> Compile option: -O3
> 
> Before this patch:
> 	mv	a7,a2
> 	mv	a6,a0	
>          mv	t1,a1
> 	mv	a2,a3
> 	vsetivli	zero,17,e8,mf8,ta,ma
> ...
> 
> After this patch:
>          mv      a7,a2
>          mv      a6,a0
>          mv      t1,a1
>          mv      a2,a3
>          ble     a7,zero,.L1
>          ble     a4,zero,.L1
>          ble     a3,zero,.L1
>          add     a1,a0,a4
>          li      a0,0
>          vsetivli        zero,17,e8,mf8,ta,ma
> ...
> 
> It will produce potential bug when:
> 
> int main ()
> {
>    vsetivli zero, 100,.....
>    f (in, out, 0,0,0)
>    asm volatile ("csrr a0,vl":::"memory");
> 
>    // Before this patch the a0 is 17. (Wrong).
>    // After this patch the a0 is 100. (Correct).
>    ...
> }
So why was that point selected in the first place?   I would have 
expected LCM to select the loop entry edge as the desired insertion point.

Essentially if LCM selects the point before those branches, then it's 
voilating a fundamental principal of LCM, namely that you never put an 
evaluation on a path where it didn't have one before.

So not objecting to the patch but it is raising concerns about the LCM 
results.

jeff
  
juzhe.zhong@rivai.ai April 2, 2023, 10:40 p.m. UTC | #2
This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand info backward fusion and propogation) which
is I introduced into VSETVL PASS to enhance LCM && improve vsetvl instruction performance.

This patch is to supress the Phase 3 too aggressive backward fusion and propagation to the top of the function program
when there is no define instruction of AVL (AVL is 0 ~ 31 imm since vsetivli instruction allows imm value instead of reg).

You may want to ask why we need Phase 3 to the job. 
Well, we have so many situations that pure LCM fails to optimize, here I can show you a simple case to demonstrate it:
void f (void * restrict in, void * restrict out, int n, int m, int cond)
{
  size_t vl = 101;
  for (size_t j = 0; j < m; j++){
    if (cond) {
      for (size_t i = 0; i < n; i++)
        {
          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, vl);
          __riscv_vse8_v_i8mf8 (out + i, v, vl);
        }
    } else {
      for (size_t i = 0; i < n; i++)
        {
          vint32mf2_t v = __riscv_vle32_v_i32mf2 (in + i + j, vl);
          v = __riscv_vadd_vv_i32mf2 (v,v,vl);
          __riscv_vse32_v_i32mf2 (out + i, v, vl);
        }
    }
  }
}

You can see:
The first inner loop needs vsetvli e8 mf8 for vle+vse.
The second inner loop need vsetvli e32 mf2 for vle+vadd+vse.

If we don't have Phase 3 (Only handled by LCM (Phase 4)), we will end up with :

outerloop:
...
vsetvli e8mf8
inner loop 1:
....

vsetvli e32mf2
inner loop 2:
....

However, if we have Phase 3, Phase 3 is going to fuse the vsetvli e32 mf2 of inner loop 2 into vsetvli e8 mf8, then we will end up with this result after phase 3:

outerloop:
...
inner loop 1:
vsetvli e32mf2
....

inner loop 2:
vsetvli e32mf2
....

Then, this demand information after phase 3 will be well optimized after phase 4 (LCM), after Phase 4 result is:

vsetvli e32mf2
outerloop:
...
inner loop 1:
....

inner loop 2:
....

You can see this is the optimal codegen after current VSETVL PASS (Phase 3: Demand backward fusion and propagation + Phase 4: LCM ). This is a known issue when I start to implement VSETVL PASS.
I leaved it to be fixed after I finished all target GCC 13 features. And Kito postpone this patch to be merged after GCC 14 is open.



juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-04-03 03:41
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix PR108279
 
 
On 3/27/23 00:59, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
>          PR 108270
> 
> Fix bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
> 
> Consider the following testcase:
> void f (void * restrict in, void * restrict out, int l, int n, int m)
> {
>    for (int i = 0; i < l; i++){
>      for (int j = 0; j < m; j++){
>        for (int k = 0; k < n; k++)
>          {
>            vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
>            __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
>          }
>      }
>    }
> }
> 
> Compile option: -O3
> 
> Before this patch:
> mv a7,a2
> mv a6,a0 
>          mv t1,a1
> mv a2,a3
> vsetivli zero,17,e8,mf8,ta,ma
> ...
> 
> After this patch:
>          mv      a7,a2
>          mv      a6,a0
>          mv      t1,a1
>          mv      a2,a3
>          ble     a7,zero,.L1
>          ble     a4,zero,.L1
>          ble     a3,zero,.L1
>          add     a1,a0,a4
>          li      a0,0
>          vsetivli        zero,17,e8,mf8,ta,ma
> ...
> 
> It will produce potential bug when:
> 
> int main ()
> {
>    vsetivli zero, 100,.....
>    f (in, out, 0,0,0)
>    asm volatile ("csrr a0,vl":::"memory");
> 
>    // Before this patch the a0 is 17. (Wrong).
>    // After this patch the a0 is 100. (Correct).
>    ...
> }
So why was that point selected in the first place?   I would have 
expected LCM to select the loop entry edge as the desired insertion point.
 
Essentially if LCM selects the point before those branches, then it's 
voilating a fundamental principal of LCM, namely that you never put an 
evaluation on a path where it didn't have one before.
 
So not objecting to the patch but it is raising concerns about the LCM 
results.
 
jeff
  
Jeff Law April 5, 2023, 1:05 p.m. UTC | #3
On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand 
> info backward fusion and propogation) which
> is I introduced into VSETVL PASS to enhance LCM && improve vsetvl 
> instruction performance.
So fusion in this context is really about identifying cases where two 
configuration settings are equivalent and you "fuse" them together. 
Presumably this is only going to be possible when the vector insns are 
just doing data movement rather than actual computations?

If my understanding is correct, I can kind of see why you're doing 
fusion during phase 3.  My sense is there's a better way, but I'm having 
a bit of trouble working out the details of what that should be to 
myself.  In any event, revamping parts of the vsetvl insertion code 
isn't the kind of thing we should be doing now.


WRT the actual patch.  Please put a function comment on the 
all_empty_predecessor_p method. Something like this perhaps?

/* Return TRUE if all the predecessors of CFG_BB have vsetvl
    state that is valid or dirty, FALSE otherwise.  */


That would seem to indicate the function is poorly named.  Unless you're 
using "empty" here to mean the state is valid or dirty.  Either way it 
seems like the function name ought to be improved.

The comments talk about bb1 being inside a loop.  Nowhere do you check 
that as far as I can tell.

When trying to understand what the patch is going I ran across this comment:

  /* The local_dem vector insn_info of the block.  */
   vector_insn_info local_dem;


That comment really doesn't improve anything.  "local_dem" is clearly 
short-hand for something (local demand?), whatever it is, make it 
clearer in the comment.

Jeff
  
juzhe.zhong@rivai.ai April 5, 2023, 1:53 p.m. UTC | #4
>> So fusion in this context is really about identifying cases where two
>> configuration settings are equivalent and you "fuse" them together.
>> Presumably this is only going to be possible when the vector insns are
>> just doing data movement rather than actual computations?

>> If my understanding is correct, I can kind of see why you're doing
>> fusion during phase 3.  My sense is there's a better way, but I'm having
>> a bit of trouble working out the details of what that should be to
>> myself.  In any event, revamping parts of the vsetvl insertion code
>> isn't the kind of thing we should be doing now.

The vsetvl demand fusion happens is not necessary "equivalent", instead, we
call it we will do demand fusion when they are "compatible".
And the fusion can happen between any vector insns including data movement
and actual computations.

What is "compatible" ??  This definition is according to RVV ISA.
For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.

According to RVV ISA:
vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
Such vsetvl instruction is configured as this demand fusion, we call it "compatible"
since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v

However, what case is not "incompatible", same example, if the vadd.vv demand SEW = 32. LMUL = MF2,
the vadd.vv is incompatible with vle.v. since we can't find a common VL/VTYPE vsetvl instruction available
for both of them.

We have local demand fusion which is Phase 1. Local demand fusion is doing the fusion within a block
And also we have global demand fusion which is Phase 3. Global demand fusion is doing across blocks.

After Phase 1, each block has a single demand fusion. Phase 3 is doing global demand fusion trying to
find the common VL/VTYPE status available for a bunch of blocks, and fuse them into a single vsetvl.
So that we eliminate redundant vsetvli.

Here is a example:
                           
                                    bb 0:  (vle.v demand RATIO = 32)
                                  /       \
                            bb 1      bb 2
                          /      \     /       \
                 bb 3       bb 4  ....     bb 5
               vadd       vmul          vdiv
            (demand  (demand      (demand 
             sew = 8,    sew = 8,      sew = 8, 
        lmul = mf4)  lmul = mf4,   lmul = mf4,
                          tail policy = tu) mask policy = mu)

So in this case, we should do the global demand fusion for bb 0, bb3, bb 4, bb5.
since they are compatible according to RVV ISA.
The final demand info of vsetvl should be vsetvl e8,mf4,tu,mu and put it 
in the bb0. Then we can avoid vsetvl in bb 3, 4, 5.

>> We have more fusion rules according to RVV ISA. Phase 3 (Global demand fusion) is 
>> really important. 

>> That would seem to indicate the function is poorly named.  Unless you're
>> using "empty" here to mean the state is valid or dirty.  Either way it
>> seems like the function name ought to be improved.

>> The comments talk about bb1 being inside a loop.  Nowhere do you check
>> that as far as I can tell.

>> When trying to understand what the patch is going I ran across this comment:

>>   /* The local_dem vector insn_info of the block.  */
 >>   vector_insn_info local_dem;


>> That comment really doesn't improve anything.  "local_dem" is clearly
>> short-hand for something (local demand?), whatever it is, make it
>> clearer in the comment.

Sorry for bad comments in the codes. Currently, I am working on the first patch
of auto-vectorization. After I sent the first patch of auto-vectorization for you to
review. I would like to re-check all the comments and code style of VSETVL PASS.
And refine them.




juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-04-05 21:05
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix PR108279
 
 
On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand 
> info backward fusion and propogation) which
> is I introduced into VSETVL PASS to enhance LCM && improve vsetvl 
> instruction performance.
So fusion in this context is really about identifying cases where two 
configuration settings are equivalent and you "fuse" them together. 
Presumably this is only going to be possible when the vector insns are 
just doing data movement rather than actual computations?
 
If my understanding is correct, I can kind of see why you're doing 
fusion during phase 3.  My sense is there's a better way, but I'm having 
a bit of trouble working out the details of what that should be to 
myself.  In any event, revamping parts of the vsetvl insertion code 
isn't the kind of thing we should be doing now.
 
 
WRT the actual patch.  Please put a function comment on the 
all_empty_predecessor_p method. Something like this perhaps?
 
/* Return TRUE if all the predecessors of CFG_BB have vsetvl
    state that is valid or dirty, FALSE otherwise.  */
 
 
That would seem to indicate the function is poorly named.  Unless you're 
using "empty" here to mean the state is valid or dirty.  Either way it 
seems like the function name ought to be improved.
 
The comments talk about bb1 being inside a loop.  Nowhere do you check 
that as far as I can tell.
 
When trying to understand what the patch is going I ran across this comment:
 
  /* The local_dem vector insn_info of the block.  */
   vector_insn_info local_dem;
 
 
That comment really doesn't improve anything.  "local_dem" is clearly 
short-hand for something (local demand?), whatever it is, make it 
clearer in the comment.
 
Jeff
  
Richard Biener April 11, 2023, 8:55 a.m. UTC | #5
On Wed, Apr 5, 2023 at 3:53 PM <juzhe.zhong@rivai.ai> wrote:
>
> >> So fusion in this context is really about identifying cases where two
> >> configuration settings are equivalent and you "fuse" them together.
> >> Presumably this is only going to be possible when the vector insns are
> >> just doing data movement rather than actual computations?
>
> >> If my understanding is correct, I can kind of see why you're doing
> >> fusion during phase 3.  My sense is there's a better way, but I'm having
> >> a bit of trouble working out the details of what that should be to
> >> myself.  In any event, revamping parts of the vsetvl insertion code
> >> isn't the kind of thing we should be doing now.
>
> The vsetvl demand fusion happens is not necessary "equivalent", instead, we
> call it we will do demand fusion when they are "compatible".
> And the fusion can happen between any vector insns including data movement
> and actual computations.
>
> What is "compatible" ??  This definition is according to RVV ISA.
> For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
> and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.
>
> According to RVV ISA:
> vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
> vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
> So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
> Such vsetvl instruction is configured as this demand fusion, we call it "compatible"
> since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v
>
> However, what case is not "incompatible", same example, if the vadd.vv demand SEW = 32. LMUL = MF2,
> the vadd.vv is incompatible with vle.v. since we can't find a common VL/VTYPE vsetvl instruction available
> for both of them.
>
> We have local demand fusion which is Phase 1. Local demand fusion is doing the fusion within a block
> And also we have global demand fusion which is Phase 3. Global demand fusion is doing across blocks.
>
> After Phase 1, each block has a single demand fusion. Phase 3 is doing global demand fusion trying to
> find the common VL/VTYPE status available for a bunch of blocks, and fuse them into a single vsetvl.
> So that we eliminate redundant vsetvli.
>
> Here is a example:
>
>                                     bb 0:  (vle.v demand RATIO = 32)
>                                   /       \
>                             bb 1      bb 2
>                           /      \     /       \
>                  bb 3       bb 4  ....     bb 5
>                vadd       vmul          vdiv
>             (demand  (demand      (demand
>              sew = 8,    sew = 8,      sew = 8,
>         lmul = mf4)  lmul = mf4,   lmul = mf4,
>                           tail policy = tu) mask policy = mu)
>
> So in this case, we should do the global demand fusion for bb 0, bb3, bb 4, bb5.
> since they are compatible according to RVV ISA.
> The final demand info of vsetvl should be vsetvl e8,mf4,tu,mu and put it
> in the bb0. Then we can avoid vsetvl in bb 3, 4, 5.

Just to throw in a comment here - I think you should present LCM with
something it
can identify as the same for compatible vsetvl and then it should just
work?  OTOH
if "compatible" is not transitive that's not possible (but then I
can't quickly make up
an example where it wouldn't be).

> >> We have more fusion rules according to RVV ISA. Phase 3 (Global demand fusion) is
> >> really important.
>
> >> That would seem to indicate the function is poorly named.  Unless you're
> >> using "empty" here to mean the state is valid or dirty.  Either way it
> >> seems like the function name ought to be improved.
>
> >> The comments talk about bb1 being inside a loop.  Nowhere do you check
> >> that as far as I can tell.
>
> >> When trying to understand what the patch is going I ran across this comment:
>
> >>   /* The local_dem vector insn_info of the block.  */
>  >>   vector_insn_info local_dem;
>
>
> >> That comment really doesn't improve anything.  "local_dem" is clearly
> >> short-hand for something (local demand?), whatever it is, make it
> >> clearer in the comment.
>
> Sorry for bad comments in the codes. Currently, I am working on the first patch
> of auto-vectorization. After I sent the first patch of auto-vectorization for you to
> review. I would like to re-check all the comments and code style of VSETVL PASS.
> And refine them.
>
>
>
>
> juzhe.zhong@rivai.ai
>
> From: Jeff Law
> Date: 2023-04-05 21:05
> To: juzhe.zhong; gcc-patches
> CC: kito.cheng; palmer
> Subject: Re: [PATCH] RISC-V: Fix PR108279
>
>
> On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> > This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand
> > info backward fusion and propogation) which
> > is I introduced into VSETVL PASS to enhance LCM && improve vsetvl
> > instruction performance.
> So fusion in this context is really about identifying cases where two
> configuration settings are equivalent and you "fuse" them together.
> Presumably this is only going to be possible when the vector insns are
> just doing data movement rather than actual computations?
>
> If my understanding is correct, I can kind of see why you're doing
> fusion during phase 3.  My sense is there's a better way, but I'm having
> a bit of trouble working out the details of what that should be to
> myself.  In any event, revamping parts of the vsetvl insertion code
> isn't the kind of thing we should be doing now.
>
>
> WRT the actual patch.  Please put a function comment on the
> all_empty_predecessor_p method. Something like this perhaps?
>
> /* Return TRUE if all the predecessors of CFG_BB have vsetvl
>     state that is valid or dirty, FALSE otherwise.  */
>
>
> That would seem to indicate the function is poorly named.  Unless you're
> using "empty" here to mean the state is valid or dirty.  Either way it
> seems like the function name ought to be improved.
>
> The comments talk about bb1 being inside a loop.  Nowhere do you check
> that as far as I can tell.
>
> When trying to understand what the patch is going I ran across this comment:
>
>   /* The local_dem vector insn_info of the block.  */
>    vector_insn_info local_dem;
>
>
> That comment really doesn't improve anything.  "local_dem" is clearly
> short-hand for something (local demand?), whatever it is, make it
> clearer in the comment.
>
> Jeff
>
  
juzhe.zhong@rivai.ai April 11, 2023, 9:18 a.m. UTC | #6
No, we can only pass "available" to LCM.
Passing "compatible" to LCM can not work for us.

LCM can only help for eliminate vsetvls can not help for fuse vsetvls.

For example:

bb 0:
vsetvl e8,mf8
vadd (Demand SEW = 8, LMUL = MF8)
bb 1:
vsetvl e32 mf2
vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1])

I use LCM to handle the case above, I tell LCM that the vsetvl of vadd is "available" for the following "vle" instruction.
Then LCM will let us to remove "vsetvl e32mf2"
This is what I said "available" case that I use LCM to handle.

However, LCM can not handle "compatible" case. Here is the example:

Loop:
{
bb 0:
vsetvl e8,mf8,TA
vadd (Demand SEW = 8, LMUL = MF8, can either TU or TA)
bb 1:
vsetvl e32 mf2,TU
vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1], and demand TU)
}
It's obvious that neither "vsetvl e8,mf8,TA" nor "vsetvl e32 mf2,TU" are available for both instructions "vadd" and "vle".
That's why we need Phase 3 in VSETVL PASS.
I do the demand fusion generate a new vsetvl instructions "vsetvl e8,mf8,TU" which is available for both RVV instructions "vadd" and "vle", 
and update the first vsetvl "vsetvl e8,mf8,TA" to  "vsetvl e8,mf8,TU"

Then, I tell LCM "vsetvl e8,mf8,TU" is available for both "vadd" and "vle32", so LCM will hoist "vsetvl e8,mf8,TU" outside the LOOP
and remove all vsetvls inside the loop.



juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-04-11 16:55
To: juzhe.zhong
CC: Jeff Law; gcc-patches; kito.cheng; palmer
Subject: Re: Re: [PATCH] RISC-V: Fix PR108279
On Wed, Apr 5, 2023 at 3:53 PM <juzhe.zhong@rivai.ai> wrote:
>
> >> So fusion in this context is really about identifying cases where two
> >> configuration settings are equivalent and you "fuse" them together.
> >> Presumably this is only going to be possible when the vector insns are
> >> just doing data movement rather than actual computations?
>
> >> If my understanding is correct, I can kind of see why you're doing
> >> fusion during phase 3.  My sense is there's a better way, but I'm having
> >> a bit of trouble working out the details of what that should be to
> >> myself.  In any event, revamping parts of the vsetvl insertion code
> >> isn't the kind of thing we should be doing now.
>
> The vsetvl demand fusion happens is not necessary "equivalent", instead, we
> call it we will do demand fusion when they are "compatible".
> And the fusion can happen between any vector insns including data movement
> and actual computations.
>
> What is "compatible" ??  This definition is according to RVV ISA.
> For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
> and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.
>
> According to RVV ISA:
> vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
> vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
> So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
> Such vsetvl instruction is configured as this demand fusion, we call it "compatible"
> since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v
>
> However, what case is not "incompatible", same example, if the vadd.vv demand SEW = 32. LMUL = MF2,
> the vadd.vv is incompatible with vle.v. since we can't find a common VL/VTYPE vsetvl instruction available
> for both of them.
>
> We have local demand fusion which is Phase 1. Local demand fusion is doing the fusion within a block
> And also we have global demand fusion which is Phase 3. Global demand fusion is doing across blocks.
>
> After Phase 1, each block has a single demand fusion. Phase 3 is doing global demand fusion trying to
> find the common VL/VTYPE status available for a bunch of blocks, and fuse them into a single vsetvl.
> So that we eliminate redundant vsetvli.
>
> Here is a example:
>
>                                     bb 0:  (vle.v demand RATIO = 32)
>                                   /       \
>                             bb 1      bb 2
>                           /      \     /       \
>                  bb 3       bb 4  ....     bb 5
>                vadd       vmul          vdiv
>             (demand  (demand      (demand
>              sew = 8,    sew = 8,      sew = 8,
>         lmul = mf4)  lmul = mf4,   lmul = mf4,
>                           tail policy = tu) mask policy = mu)
>
> So in this case, we should do the global demand fusion for bb 0, bb3, bb 4, bb5.
> since they are compatible according to RVV ISA.
> The final demand info of vsetvl should be vsetvl e8,mf4,tu,mu and put it
> in the bb0. Then we can avoid vsetvl in bb 3, 4, 5.
 
Just to throw in a comment here - I think you should present LCM with
something it
can identify as the same for compatible vsetvl and then it should just
work?  OTOH
if "compatible" is not transitive that's not possible (but then I
can't quickly make up
an example where it wouldn't be).
 
> >> We have more fusion rules according to RVV ISA. Phase 3 (Global demand fusion) is
> >> really important.
>
> >> That would seem to indicate the function is poorly named.  Unless you're
> >> using "empty" here to mean the state is valid or dirty.  Either way it
> >> seems like the function name ought to be improved.
>
> >> The comments talk about bb1 being inside a loop.  Nowhere do you check
> >> that as far as I can tell.
>
> >> When trying to understand what the patch is going I ran across this comment:
>
> >>   /* The local_dem vector insn_info of the block.  */
>  >>   vector_insn_info local_dem;
>
>
> >> That comment really doesn't improve anything.  "local_dem" is clearly
> >> short-hand for something (local demand?), whatever it is, make it
> >> clearer in the comment.
>
> Sorry for bad comments in the codes. Currently, I am working on the first patch
> of auto-vectorization. After I sent the first patch of auto-vectorization for you to
> review. I would like to re-check all the comments and code style of VSETVL PASS.
> And refine them.
>
>
>
>
> juzhe.zhong@rivai.ai
>
> From: Jeff Law
> Date: 2023-04-05 21:05
> To: juzhe.zhong; gcc-patches
> CC: kito.cheng; palmer
> Subject: Re: [PATCH] RISC-V: Fix PR108279
>
>
> On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> > This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand
> > info backward fusion and propogation) which
> > is I introduced into VSETVL PASS to enhance LCM && improve vsetvl
> > instruction performance.
> So fusion in this context is really about identifying cases where two
> configuration settings are equivalent and you "fuse" them together.
> Presumably this is only going to be possible when the vector insns are
> just doing data movement rather than actual computations?
>
> If my understanding is correct, I can kind of see why you're doing
> fusion during phase 3.  My sense is there's a better way, but I'm having
> a bit of trouble working out the details of what that should be to
> myself.  In any event, revamping parts of the vsetvl insertion code
> isn't the kind of thing we should be doing now.
>
>
> WRT the actual patch.  Please put a function comment on the
> all_empty_predecessor_p method. Something like this perhaps?
>
> /* Return TRUE if all the predecessors of CFG_BB have vsetvl
>     state that is valid or dirty, FALSE otherwise.  */
>
>
> That would seem to indicate the function is poorly named.  Unless you're
> using "empty" here to mean the state is valid or dirty.  Either way it
> seems like the function name ought to be improved.
>
> The comments talk about bb1 being inside a loop.  Nowhere do you check
> that as far as I can tell.
>
> When trying to understand what the patch is going I ran across this comment:
>
>   /* The local_dem vector insn_info of the block.  */
>    vector_insn_info local_dem;
>
>
> That comment really doesn't improve anything.  "local_dem" is clearly
> short-hand for something (local demand?), whatever it is, make it
> clearer in the comment.
>
> Jeff
>
  
Richard Biener April 11, 2023, 11:19 a.m. UTC | #7
On Tue, Apr 11, 2023 at 11:19 AM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> No, we can only pass "available" to LCM.
> Passing "compatible" to LCM can not work for us.
>
> LCM can only help for eliminate vsetvls can not help for fuse vsetvls.
>
> For example:
>
> bb 0:
> vsetvl e8,mf8
> vadd (Demand SEW = 8, LMUL = MF8)
> bb 1:
> vsetvl e32 mf2
> vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1])
>
> I use LCM to handle the case above, I tell LCM that the vsetvl of vadd is "available" for the following "vle" instruction.
> Then LCM will let us to remove "vsetvl e32mf2"
> This is what I said "available" case that I use LCM to handle.
>
> However, LCM can not handle "compatible" case. Here is the example:
>
> Loop:
> {
> bb 0:
> vsetvl e8,mf8,TA
> vadd (Demand SEW = 8, LMUL = MF8, can either TU or TA)
> bb 1:
> vsetvl e32 mf2,TU
> vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1], and demand TU)
> }

So for this case the vle32 instruction is the one that also works with
other VL, which means the vsetvl is too strict.  I agree
if the above is the input to LCM then it's difficult to make it work.
But then maybe initial placement should be only
done for strict cases.  Anyway - I'm just throwing in wild guesses here.

> It's obvious that neither "vsetvl e8,mf8,TA" nor "vsetvl e32 mf2,TU" are available for both instructions "vadd" and "vle".
> That's why we need Phase 3 in VSETVL PASS.
> I do the demand fusion generate a new vsetvl instructions "vsetvl e8,mf8,TU" which is available for both RVV instructions "vadd" and "vle",
> and update the first vsetvl "vsetvl e8,mf8,TA" to  "vsetvl e8,mf8,TU"
>
> Then, I tell LCM "vsetvl e8,mf8,TU" is available for both "vadd" and "vle32", so LCM will hoist "vsetvl e8,mf8,TU" outside the LOOP
> and remove all vsetvls inside the loop.
>
> ________________________________
> juzhe.zhong@rivai.ai
>
>
> From: Richard Biener
> Date: 2023-04-11 16:55
> To: juzhe.zhong
> CC: Jeff Law; gcc-patches; kito.cheng; palmer
> Subject: Re: Re: [PATCH] RISC-V: Fix PR108279
> On Wed, Apr 5, 2023 at 3:53 PM <juzhe.zhong@rivai.ai> wrote:
> >
> > >> So fusion in this context is really about identifying cases where two
> > >> configuration settings are equivalent and you "fuse" them together.
> > >> Presumably this is only going to be possible when the vector insns are
> > >> just doing data movement rather than actual computations?
> >
> > >> If my understanding is correct, I can kind of see why you're doing
> > >> fusion during phase 3.  My sense is there's a better way, but I'm having
> > >> a bit of trouble working out the details of what that should be to
> > >> myself.  In any event, revamping parts of the vsetvl insertion code
> > >> isn't the kind of thing we should be doing now.
> >
> > The vsetvl demand fusion happens is not necessary "equivalent", instead, we
> > call it we will do demand fusion when they are "compatible".
> > And the fusion can happen between any vector insns including data movement
> > and actual computations.
> >
> > What is "compatible" ??  This definition is according to RVV ISA.
> > For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
> > and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.
> >
> > According to RVV ISA:
> > vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
> > vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
> > So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
> > Such vsetvl instruction is configured as this demand fusion, we call it "compatible"
> > since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v
> >
> > However, what case is not "incompatible", same example, if the vadd.vv demand SEW = 32. LMUL = MF2,
> > the vadd.vv is incompatible with vle.v. since we can't find a common VL/VTYPE vsetvl instruction available
> > for both of them.
> >
> > We have local demand fusion which is Phase 1. Local demand fusion is doing the fusion within a block
> > And also we have global demand fusion which is Phase 3. Global demand fusion is doing across blocks.
> >
> > After Phase 1, each block has a single demand fusion. Phase 3 is doing global demand fusion trying to
> > find the common VL/VTYPE status available for a bunch of blocks, and fuse them into a single vsetvl.
> > So that we eliminate redundant vsetvli.
> >
> > Here is a example:
> >
> >                                     bb 0:  (vle.v demand RATIO = 32)
> >                                   /       \
> >                             bb 1      bb 2
> >                           /      \     /       \
> >                  bb 3       bb 4  ....     bb 5
> >                vadd       vmul          vdiv
> >             (demand  (demand      (demand
> >              sew = 8,    sew = 8,      sew = 8,
> >         lmul = mf4)  lmul = mf4,   lmul = mf4,
> >                           tail policy = tu) mask policy = mu)
> >
> > So in this case, we should do the global demand fusion for bb 0, bb3, bb 4, bb5.
> > since they are compatible according to RVV ISA.
> > The final demand info of vsetvl should be vsetvl e8,mf4,tu,mu and put it
> > in the bb0. Then we can avoid vsetvl in bb 3, 4, 5.
>
> Just to throw in a comment here - I think you should present LCM with
> something it
> can identify as the same for compatible vsetvl and then it should just
> work?  OTOH
> if "compatible" is not transitive that's not possible (but then I
> can't quickly make up
> an example where it wouldn't be).
>
> > >> We have more fusion rules according to RVV ISA. Phase 3 (Global demand fusion) is
> > >> really important.
> >
> > >> That would seem to indicate the function is poorly named.  Unless you're
> > >> using "empty" here to mean the state is valid or dirty.  Either way it
> > >> seems like the function name ought to be improved.
> >
> > >> The comments talk about bb1 being inside a loop.  Nowhere do you check
> > >> that as far as I can tell.
> >
> > >> When trying to understand what the patch is going I ran across this comment:
> >
> > >>   /* The local_dem vector insn_info of the block.  */
> >  >>   vector_insn_info local_dem;
> >
> >
> > >> That comment really doesn't improve anything.  "local_dem" is clearly
> > >> short-hand for something (local demand?), whatever it is, make it
> > >> clearer in the comment.
> >
> > Sorry for bad comments in the codes. Currently, I am working on the first patch
> > of auto-vectorization. After I sent the first patch of auto-vectorization for you to
> > review. I would like to re-check all the comments and code style of VSETVL PASS.
> > And refine them.
> >
> >
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Jeff Law
> > Date: 2023-04-05 21:05
> > To: juzhe.zhong; gcc-patches
> > CC: kito.cheng; palmer
> > Subject: Re: [PATCH] RISC-V: Fix PR108279
> >
> >
> > On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> > > This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand
> > > info backward fusion and propogation) which
> > > is I introduced into VSETVL PASS to enhance LCM && improve vsetvl
> > > instruction performance.
> > So fusion in this context is really about identifying cases where two
> > configuration settings are equivalent and you "fuse" them together.
> > Presumably this is only going to be possible when the vector insns are
> > just doing data movement rather than actual computations?
> >
> > If my understanding is correct, I can kind of see why you're doing
> > fusion during phase 3.  My sense is there's a better way, but I'm having
> > a bit of trouble working out the details of what that should be to
> > myself.  In any event, revamping parts of the vsetvl insertion code
> > isn't the kind of thing we should be doing now.
> >
> >
> > WRT the actual patch.  Please put a function comment on the
> > all_empty_predecessor_p method. Something like this perhaps?
> >
> > /* Return TRUE if all the predecessors of CFG_BB have vsetvl
> >     state that is valid or dirty, FALSE otherwise.  */
> >
> >
> > That would seem to indicate the function is poorly named.  Unless you're
> > using "empty" here to mean the state is valid or dirty.  Either way it
> > seems like the function name ought to be improved.
> >
> > The comments talk about bb1 being inside a loop.  Nowhere do you check
> > that as far as I can tell.
> >
> > When trying to understand what the patch is going I ran across this comment:
> >
> >   /* The local_dem vector insn_info of the block.  */
> >    vector_insn_info local_dem;
> >
> >
> > That comment really doesn't improve anything.  "local_dem" is clearly
> > short-hand for something (local demand?), whatever it is, make it
> > clearer in the comment.
> >
> > Jeff
> >
>
  
juzhe.zhong@rivai.ai April 11, 2023, 11:35 a.m. UTC | #8
Oh, sorry for didn't explain this clearly.
In this example, the "vle32" should be using same VL as "vadd" that I didn't mention.

But we do have other instructions doesn't care about VL operand. So let me explain more clearly.
Same example I repeated again:
Loop:
{
bb 0:
vsetvl VL, e8,mf8,TA
vadd (Demand SEW = 8, LMUL = MF8, can either TU or TA, demand VL)
bb 1:
vsetvl VL, e32 mf2,TU
vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1], and demand TU, demand VL)
}
In this example, when the "VL" of vadd is the same as "VL" fo vle32, then we should do is generate a new vsetvl "vsetvl VL, e32,mf2,TU" outside the loop
then, remove those 2 vsetvl inside the loop.

Another example is that we do have some rvv instructions doesn't demand "VL":
Loop:
{
bb 0:
vsetvl e8,m1,TA (can be any VL, can be any LMUL)
vmv.x.s (Demand SEW = 8, LMUL default value = 1 (However, it's available also for all LMUL), tail policty can be either TU or TA, do not demand VL)
bb 1:
vsetvl VL, e32 mf2,TU
vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1], and demand TU, demand VL)
}

Then in this case, we should generate "vsetvl VL (this VL is from vle32.v), e8, mf8, TU outside the loop,
And eliminate those 2 instructions inside the loop.

Hmmm, may not be easy to understand since RVV instructions have various vsetvl configuration. 

Some instructions like vle32...load/stores, they demand RATIO = SEW/LMUL, meanning they don't need the exactly the SEW and LMUL just only SEW/LMUL ratio will be enough.
Some instructions like vadd, vsub,...they demand exactly SEW && LMUL to be exact value, the SEW/LMUL ratio is not enough.
Some instructions like comparison instructions, they don't care about tail policy....
Some instructions like vmv.x.s, doesn't care about VL.
....etc.
Quite complicated, so we have defined several fusion rules in VSETVL PASS....


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-04-11 19:19
To: juzhe.zhong@rivai.ai
CC: jeffreyalaw; gcc-patches; kito.cheng; palmer
Subject: Re: Re: [PATCH] RISC-V: Fix PR108279
On Tue, Apr 11, 2023 at 11:19 AM juzhe.zhong@rivai.ai
<juzhe.zhong@rivai.ai> wrote:
>
> No, we can only pass "available" to LCM.
> Passing "compatible" to LCM can not work for us.
>
> LCM can only help for eliminate vsetvls can not help for fuse vsetvls.
>
> For example:
>
> bb 0:
> vsetvl e8,mf8
> vadd (Demand SEW = 8, LMUL = MF8)
> bb 1:
> vsetvl e32 mf2
> vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1])
>
> I use LCM to handle the case above, I tell LCM that the vsetvl of vadd is "available" for the following "vle" instruction.
> Then LCM will let us to remove "vsetvl e32mf2"
> This is what I said "available" case that I use LCM to handle.
>
> However, LCM can not handle "compatible" case. Here is the example:
>
> Loop:
> {
> bb 0:
> vsetvl e8,mf8,TA
> vadd (Demand SEW = 8, LMUL = MF8, can either TU or TA)
> bb 1:
> vsetvl e32 mf2,TU
> vle32 (Demand RATIO (SEW/LMUL) = 64, so available [SEW, LMUL] = [e8,mf8 or e16,mf4 or e32,mf2 or e64,m1], and demand TU)
> }
 
So for this case the vle32 instruction is the one that also works with
other VL, which means the vsetvl is too strict.  I agree
if the above is the input to LCM then it's difficult to make it work.
But then maybe initial placement should be only
done for strict cases.  Anyway - I'm just throwing in wild guesses here.
 
> It's obvious that neither "vsetvl e8,mf8,TA" nor "vsetvl e32 mf2,TU" are available for both instructions "vadd" and "vle".
> That's why we need Phase 3 in VSETVL PASS.
> I do the demand fusion generate a new vsetvl instructions "vsetvl e8,mf8,TU" which is available for both RVV instructions "vadd" and "vle",
> and update the first vsetvl "vsetvl e8,mf8,TA" to  "vsetvl e8,mf8,TU"
>
> Then, I tell LCM "vsetvl e8,mf8,TU" is available for both "vadd" and "vle32", so LCM will hoist "vsetvl e8,mf8,TU" outside the LOOP
> and remove all vsetvls inside the loop.
>
> ________________________________
> juzhe.zhong@rivai.ai
>
>
> From: Richard Biener
> Date: 2023-04-11 16:55
> To: juzhe.zhong
> CC: Jeff Law; gcc-patches; kito.cheng; palmer
> Subject: Re: Re: [PATCH] RISC-V: Fix PR108279
> On Wed, Apr 5, 2023 at 3:53 PM <juzhe.zhong@rivai.ai> wrote:
> >
> > >> So fusion in this context is really about identifying cases where two
> > >> configuration settings are equivalent and you "fuse" them together.
> > >> Presumably this is only going to be possible when the vector insns are
> > >> just doing data movement rather than actual computations?
> >
> > >> If my understanding is correct, I can kind of see why you're doing
> > >> fusion during phase 3.  My sense is there's a better way, but I'm having
> > >> a bit of trouble working out the details of what that should be to
> > >> myself.  In any event, revamping parts of the vsetvl insertion code
> > >> isn't the kind of thing we should be doing now.
> >
> > The vsetvl demand fusion happens is not necessary "equivalent", instead, we
> > call it we will do demand fusion when they are "compatible".
> > And the fusion can happen between any vector insns including data movement
> > and actual computations.
> >
> > What is "compatible" ??  This definition is according to RVV ISA.
> > For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
> > and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.
> >
> > According to RVV ISA:
> > vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
> > vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
> > So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
> > Such vsetvl instruction is configured as this demand fusion, we call it "compatible"
> > since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v
> >
> > However, what case is not "incompatible", same example, if the vadd.vv demand SEW = 32. LMUL = MF2,
> > the vadd.vv is incompatible with vle.v. since we can't find a common VL/VTYPE vsetvl instruction available
> > for both of them.
> >
> > We have local demand fusion which is Phase 1. Local demand fusion is doing the fusion within a block
> > And also we have global demand fusion which is Phase 3. Global demand fusion is doing across blocks.
> >
> > After Phase 1, each block has a single demand fusion. Phase 3 is doing global demand fusion trying to
> > find the common VL/VTYPE status available for a bunch of blocks, and fuse them into a single vsetvl.
> > So that we eliminate redundant vsetvli.
> >
> > Here is a example:
> >
> >                                     bb 0:  (vle.v demand RATIO = 32)
> >                                   /       \
> >                             bb 1      bb 2
> >                           /      \     /       \
> >                  bb 3       bb 4  ....     bb 5
> >                vadd       vmul          vdiv
> >             (demand  (demand      (demand
> >              sew = 8,    sew = 8,      sew = 8,
> >         lmul = mf4)  lmul = mf4,   lmul = mf4,
> >                           tail policy = tu) mask policy = mu)
> >
> > So in this case, we should do the global demand fusion for bb 0, bb3, bb 4, bb5.
> > since they are compatible according to RVV ISA.
> > The final demand info of vsetvl should be vsetvl e8,mf4,tu,mu and put it
> > in the bb0. Then we can avoid vsetvl in bb 3, 4, 5.
>
> Just to throw in a comment here - I think you should present LCM with
> something it
> can identify as the same for compatible vsetvl and then it should just
> work?  OTOH
> if "compatible" is not transitive that's not possible (but then I
> can't quickly make up
> an example where it wouldn't be).
>
> > >> We have more fusion rules according to RVV ISA. Phase 3 (Global demand fusion) is
> > >> really important.
> >
> > >> That would seem to indicate the function is poorly named.  Unless you're
> > >> using "empty" here to mean the state is valid or dirty.  Either way it
> > >> seems like the function name ought to be improved.
> >
> > >> The comments talk about bb1 being inside a loop.  Nowhere do you check
> > >> that as far as I can tell.
> >
> > >> When trying to understand what the patch is going I ran across this comment:
> >
> > >>   /* The local_dem vector insn_info of the block.  */
> >  >>   vector_insn_info local_dem;
> >
> >
> > >> That comment really doesn't improve anything.  "local_dem" is clearly
> > >> short-hand for something (local demand?), whatever it is, make it
> > >> clearer in the comment.
> >
> > Sorry for bad comments in the codes. Currently, I am working on the first patch
> > of auto-vectorization. After I sent the first patch of auto-vectorization for you to
> > review. I would like to re-check all the comments and code style of VSETVL PASS.
> > And refine them.
> >
> >
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Jeff Law
> > Date: 2023-04-05 21:05
> > To: juzhe.zhong; gcc-patches
> > CC: kito.cheng; palmer
> > Subject: Re: [PATCH] RISC-V: Fix PR108279
> >
> >
> > On 4/2/23 16:40, juzhe.zhong@rivai.ai wrote:
> > > This point is seletected not because LCM but by Phase 3 (VL/VTYPE demand
> > > info backward fusion and propogation) which
> > > is I introduced into VSETVL PASS to enhance LCM && improve vsetvl
> > > instruction performance.
> > So fusion in this context is really about identifying cases where two
> > configuration settings are equivalent and you "fuse" them together.
> > Presumably this is only going to be possible when the vector insns are
> > just doing data movement rather than actual computations?
> >
> > If my understanding is correct, I can kind of see why you're doing
> > fusion during phase 3.  My sense is there's a better way, but I'm having
> > a bit of trouble working out the details of what that should be to
> > myself.  In any event, revamping parts of the vsetvl insertion code
> > isn't the kind of thing we should be doing now.
> >
> >
> > WRT the actual patch.  Please put a function comment on the
> > all_empty_predecessor_p method. Something like this perhaps?
> >
> > /* Return TRUE if all the predecessors of CFG_BB have vsetvl
> >     state that is valid or dirty, FALSE otherwise.  */
> >
> >
> > That would seem to indicate the function is poorly named.  Unless you're
> > using "empty" here to mean the state is valid or dirty.  Either way it
> > seems like the function name ought to be improved.
> >
> > The comments talk about bb1 being inside a loop.  Nowhere do you check
> > that as far as I can tell.
> >
> > When trying to understand what the patch is going I ran across this comment:
> >
> >   /* The local_dem vector insn_info of the block.  */
> >    vector_insn_info local_dem;
> >
> >
> > That comment really doesn't improve anything.  "local_dem" is clearly
> > short-hand for something (local demand?), whatever it is, make it
> > clearer in the comment.
> >
> > Jeff
> >
>
  
Jeff Law April 11, 2023, 9:14 p.m. UTC | #9
On 4/11/23 02:55, Richard Biener wrote:

> 
> Just to throw in a comment here - I think you should present LCM
> with something it can identify as the same for compatible vsetvl and
> then it should just work?  OTOH if "compatible" is not transitive
> that's not possible (but then I can't quickly make up an example
> where it wouldn't be).
I'm not sure it's that simple.  Or at least not with a single iteration 
of LCM.

One problem is that kills may affecting one setting, but not the other. 
I couldn't mentally come up with a single pass LCM to handle the case 
Juzhe was handling.  ie, you may have two compatible settings where you 
can unify them and hoist the compatible setting to a less executed 
point.  But the transp set for one of two compatible settings may be 
different for the other compatible setting because of vector 
instructions in a block.

What was starting to form was a two pass approach.  One which worked 
with individual vsetvl settings, another which worked on unified vsetvl 
settings.  It wasn't clear to me which ordering would be better, but I 
didn't work through the likely scenarios -- it was clear this wasn't the 
time to introduce that kind of conceptual change.

jeff
  
juzhe.zhong@rivai.ai April 11, 2023, 11:09 p.m. UTC | #10
I don't want to seperate VSETVL PASS into 2 seperate PASS.
I want make everything cleaner.

Another example is VSETVL PASS can do the branch prediction:
https://godbolt.org/z/K44r98E5v 
In function "f", you can see we put the hoist vsetvl from a more likely block (i !=cond) outside the loop,
then eliminate the vsetvl of this block. (Branch prediction is not that perfect in VSETVL PASS, I plan to 
optimize more when GCC 14 is open).

"f2" function is the normal fuse that we do in Phase 3.


juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-04-12 05:14
To: Richard Biener; juzhe.zhong
CC: gcc-patches; kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix PR108279
 
 
On 4/11/23 02:55, Richard Biener wrote:
 
> 
> Just to throw in a comment here - I think you should present LCM
> with something it can identify as the same for compatible vsetvl and
> then it should just work?  OTOH if "compatible" is not transitive
> that's not possible (but then I can't quickly make up an example
> where it wouldn't be).
I'm not sure it's that simple.  Or at least not with a single iteration 
of LCM.
 
One problem is that kills may affecting one setting, but not the other. 
I couldn't mentally come up with a single pass LCM to handle the case 
Juzhe was handling.  ie, you may have two compatible settings where you 
can unify them and hoist the compatible setting to a less executed 
point.  But the transp set for one of two compatible settings may be 
different for the other compatible setting because of vector 
instructions in a block.
 
What was starting to form was a two pass approach.  One which worked 
with individual vsetvl settings, another which worked on unified vsetvl 
settings.  It wasn't clear to me which ordering would be better, but I 
didn't work through the likely scenarios -- it was clear this wasn't the 
time to introduce that kind of conceptual change.
 
jeff
  
Jeff Law April 11, 2023, 11:11 p.m. UTC | #11
On 4/11/23 17:09, juzhe.zhong@rivai.ai wrote:
> I don't want to seperate VSETVL PASS into 2 seperate PASS.
> I want make everything cleaner.
Well, two pass vsetvl might actually be cleaner.  But as I've noted 
before, now is not the time to debate the vsetvl implementation detail. 
We've got much more important stuff to deal with.

Jeff
  
Jeff Law April 12, 2023, 11:18 p.m. UTC | #12
On 4/5/23 07:53, juzhe.zhong@rivai.ai wrote:
>  >> So fusion in this context is really about identifying cases where two
>>>configuration settings are equivalent and you "fuse" them together.
>>>Presumably this is only going to be possible when the vector insns are
>>>just doing data movement rather than actual computations?
> 
>>>If my understanding is correct, I can kind of see why you're doing
>>>fusion during phase 3.  My sense is there's a better way, but I'm having
>>>a bit of trouble working out the details of what that should be to
>>>myself.  In any event, revamping parts of the vsetvl insertion code
>>>isn't the kind of thing we should be doing now.
> 
> The vsetvl demand fusion happens is not necessary "equivalent", instead, we
> call it we will do demand fusion when they are "compatible".
> And the fusion can happen between any vector insns including data movement
> and actual computations.
I wasn't precise enough in my language, sorry about that.  "compatible" 
would definitely have been a better choice of words on my part.


> 
> What is "compatible" ??  This definition is according to RVV ISA.
> For example , For a vadd.vv need a vsetvl zero, 4, e32,m1,ta,ma.
> and a vle.v need a vsetvl zero,4,e8,mf4,ta,ma.
> 
> According to RVV ISA:
> vadd.vv demand SEW = 32, LMUL = M1, AVL = 4
> vle.v demand RATIO = SEW/LMUL = 32, AVL = 4.
> So after demand fusion, the demand becomes SEW = 32, LMUL = M1, AVL = 4.
> Such vsetvl instruction is configured as this demand fusion, we call it 
> "compatible"
> since we can find a common vsetvl VL/VTYPE status for both vadd.vv and vle.v
Thanks.  Yea, that makes sense.  Maybe a better way to state what I was 
thinking was that for pure data movement we have degrees of freedom to 
adjust the vector configuration to match something else and thus remove 
a vsetvl.

jeff
  
Jeff Law April 12, 2023, 11:23 p.m. UTC | #13
On 3/27/23 00:59, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
>          PR 108270
> 
> Fix bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
> 
> Consider the following testcase:
> void f (void * restrict in, void * restrict out, int l, int n, int m)
> {
>    for (int i = 0; i < l; i++){
>      for (int j = 0; j < m; j++){
>        for (int k = 0; k < n; k++)
>          {
>            vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
>            __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
>          }
>      }
>    }
> }
> 
> Compile option: -O3
> 
> Before this patch:
> 	mv	a7,a2
> 	mv	a6,a0	
>          mv	t1,a1
> 	mv	a2,a3
> 	vsetivli	zero,17,e8,mf8,ta,ma
> ...
> 
> After this patch:
>          mv      a7,a2
>          mv      a6,a0
>          mv      t1,a1
>          mv      a2,a3
>          ble     a7,zero,.L1
>          ble     a4,zero,.L1
>          ble     a3,zero,.L1
>          add     a1,a0,a4
>          li      a0,0
>          vsetivli        zero,17,e8,mf8,ta,ma
> ...
> 
> It will produce potential bug when:
> 
> int main ()
> {
>    vsetivli zero, 100,.....
>    f (in, out, 0,0,0)
>    asm volatile ("csrr a0,vl":::"memory");
> 
>    // Before this patch the a0 is 17. (Wrong).
>    // After this patch the a0 is 100. (Correct).
>    ...
> }
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-vsetvl.cc (vector_infos_manager::all_empty_predecessor_p): New function.
>          (pass_vsetvl::backward_demand_fusion): Fix bug.
>          * config/riscv/riscv-vsetvl.h: New function declare.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c: Adapt test.
>          * gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c: Adapt test.
>          * gcc.target/riscv/rvv/vsetvl/pr108270.c: New test.
I've largely figured this out.  But I'd still recommend we wait for 
gcc-14.  The BZ is a missed optimization (poor placement of the vsetvl). 
   We can address is with your patch once gcc-13 branches.

Thanks for walking my through the implementation details.

Jeff
  
Jeff Law April 22, 2023, 3:06 a.m. UTC | #14
On 3/27/23 00:59, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
>          PR 108270
> 
> Fix bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
> 
> Consider the following testcase:
> void f (void * restrict in, void * restrict out, int l, int n, int m)
> {
>    for (int i = 0; i < l; i++){
>      for (int j = 0; j < m; j++){
>        for (int k = 0; k < n; k++)
>          {
>            vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
>            __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
>          }
>      }
>    }
> }
> 
> Compile option: -O3
> 
> Before this patch:
> 	mv	a7,a2
> 	mv	a6,a0	
>          mv	t1,a1
> 	mv	a2,a3
> 	vsetivli	zero,17,e8,mf8,ta,ma
> ...
> 
> After this patch:
>          mv      a7,a2
>          mv      a6,a0
>          mv      t1,a1
>          mv      a2,a3
>          ble     a7,zero,.L1
>          ble     a4,zero,.L1
>          ble     a3,zero,.L1
>          add     a1,a0,a4
>          li      a0,0
>          vsetivli        zero,17,e8,mf8,ta,ma
> ...
> 
> It will produce potential bug when:
> 
> int main ()
> {
>    vsetivli zero, 100,.....
>    f (in, out, 0,0,0)
>    asm volatile ("csrr a0,vl":::"memory");
> 
>    // Before this patch the a0 is 17. (Wrong).
>    // After this patch the a0 is 100. (Correct).
>    ...
> }
> 
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-vsetvl.cc (vector_infos_manager::all_empty_predecessor_p): New function.
>          (pass_vsetvl::backward_demand_fusion): Fix bug.
>          * config/riscv/riscv-vsetvl.h: New function declare.
> 
> gcc/testsuite/ChangeLog:
> 
>          * gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c: Adapt test.
>          * gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c: Adapt test.
>          * gcc.target/riscv/rvv/vsetvl/pr108270.c: New test.
> 
> ---
>   gcc/config/riscv/riscv-vsetvl.cc              | 24 +++++++++++++++++++
>   gcc/config/riscv/riscv-vsetvl.h               |  2 ++
>   .../riscv/rvv/vsetvl/imm_bb_prop-1.c          |  2 +-
>   .../riscv/rvv/vsetvl/imm_conflict-3.c         |  4 ++--
>   .../gcc.target/riscv/rvv/vsetvl/pr108270.c    | 19 +++++++++++++++
>   5 files changed, 48 insertions(+), 3 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
> 
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
> index b5f5301ea43..4948e5d4c5e 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -2361,6 +2361,21 @@ vector_infos_manager::all_same_ratio_p (sbitmap bitdata) const
>     return true;
>   }
>   
> +bool
> +vector_infos_manager::all_empty_predecessor_p (const basic_block cfg_bb) const
Needs a function comment.  Perhaps something like:

/* Return TRUE if CFG_BB's predecessors have no vector configuration
    state.  FALSE otherwise.  */

Which I think argues that the name isn't good.  Perhaps 
"no_vector_state_in_preds" would be a better name?



>   
> +      /* Fix PR108270:
> +
> +		bb 0 -> bb 1
> +	 We don't need to backward fuse VL/VTYPE info from bb 1 to bb 0
> +	 if bb 1 is not inside a loop and all predecessors of bb 0 are empty. */
> +      if (m_vector_manager->all_empty_predecessor_p (cfg_bb))
> +	continue;
Rather than "empty" I would say something about vector configuration 
state.  "empty" is much more likely to be interpreted as having no 
instructions or something similar, which isn't the property you're checking.



So I think making the minor comment/name changes and this will be fine. 
Please repost it though for a final ACK.

jeff
  

Patch

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index b5f5301ea43..4948e5d4c5e 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2361,6 +2361,21 @@  vector_infos_manager::all_same_ratio_p (sbitmap bitdata) const
   return true;
 }
 
+bool
+vector_infos_manager::all_empty_predecessor_p (const basic_block cfg_bb) const
+{
+  hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
+  for (const basic_block pred_cfg_bb : pred_cfg_bbs)
+    {
+      const auto &pred_block_info = vector_block_infos[pred_cfg_bb->index];
+      if (!pred_block_info.local_dem.valid_or_dirty_p ()
+	  && !pred_block_info.reaching_out.valid_or_dirty_p ())
+	continue;
+      return false;
+    }
+  return true;
+}
+
 bool
 vector_infos_manager::all_same_avl_p (const basic_block cfg_bb,
 				      sbitmap bitdata) const
@@ -3118,6 +3133,14 @@  pass_vsetvl::backward_demand_fusion (void)
       if (!backward_propagate_worthwhile_p (cfg_bb, curr_block_info))
 	continue;
 
+      /* Fix PR108270:
+
+		bb 0 -> bb 1
+	 We don't need to backward fuse VL/VTYPE info from bb 1 to bb 0
+	 if bb 1 is not inside a loop and all predecessors of bb 0 are empty. */
+      if (m_vector_manager->all_empty_predecessor_p (cfg_bb))
+	continue;
+
       edge e;
       edge_iterator ei;
       /* Backward propagate to each predecessor.  */
@@ -3131,6 +3154,7 @@  pass_vsetvl::backward_demand_fusion (void)
 	    continue;
 	  if (e->src->index == ENTRY_BLOCK_PTR_FOR_FN (cfun)->index)
 	    continue;
+
 	  /* If prop is demand of vsetvl instruction and reaching doesn't demand
 	     AVL. We don't backward propagate since vsetvl instruction has no
 	     side effects.  */
diff --git a/gcc/config/riscv/riscv-vsetvl.h b/gcc/config/riscv/riscv-vsetvl.h
index 237381f7026..eec03d35071 100644
--- a/gcc/config/riscv/riscv-vsetvl.h
+++ b/gcc/config/riscv/riscv-vsetvl.h
@@ -450,6 +450,8 @@  public:
   /* Return true if all expression set in bitmap are same ratio.  */
   bool all_same_ratio_p (sbitmap) const;
 
+  bool all_empty_predecessor_p (const basic_block) const;
+
   void release (void);
   void create_bitmap_vectors (void);
   void free_bitmap_vectors (void);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
index cd4ee7dd0d3..ed32a40f5e7 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c
@@ -29,4 +29,4 @@  void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
   }
 }
 
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
index 1f7c0f036a2..2fa29c01dbc 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c
@@ -20,7 +20,7 @@  void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
   }
 }
 
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
 /* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
-/* { dg-final { scan-assembler-times {vsetivli} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
 /* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0"  no-opts "-funroll-loops" no-opts "-g" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
new file mode 100644
index 00000000000..d2ae43bf263
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (void * restrict in, void * restrict out, int l, int n, int m)
+{
+  for (int i = 0; i < l; i++){
+    for (int j = 0; j < m; j++){
+      for (int k = 0; k < n; k++)
+        {
+          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
+          __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
+        }
+    }
+  }
+}
+
+/* { dg-final { scan-assembler-not {mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+vsetivli} } } */