Fix PR 110042: ifcvt regression due to paradoxical subregs

Message ID 20230531043249.2561061-1-apinski@marvell.com
State Committed
Commit df0853d72d38247aed577a4511450c91794f2f06
Headers
Series Fix PR 110042: ifcvt regression due to paradoxical subregs |

Commit Message

Andrew Pinski May 31, 2023, 4:32 a.m. UTC
  After r14-1014-gc5df248509b489364c573e8, GCC started to emit
directly a zero_extract for `(t1&0x8)!=0`. This introduced
a small regression where ifcvt would not do the ifconversion
as there is now a paradoxical subreg in the dest which
was being rejected. Since paradoxical subreg set the whole
register, we can treat it as the same as a reg in the two places.

OK? Bootstrapped and tested on x86_64-linux-gnu and aarch64-linux-gnu.

gcc/ChangeLog:

	PR rtl-optimization/110042
	* ifcvt.cc (bbs_ok_for_cmove_arith): Allow paradoxical subregs.
	(bb_valid_for_noce_process_p): Strip the subreg for the SET_DEST.

gcc/testsuite/ChangeLog:

	PR rtl-optimization/110042
	* gcc.target/aarch64/csel_bfx_2.c: New test.
---
 gcc/ifcvt.cc                                  | 14 ++++++----
 gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c | 27 +++++++++++++++++++
 2 files changed, 36 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
  

Comments

Richard Biener May 31, 2023, 7:26 a.m. UTC | #1
On Wed, May 31, 2023 at 6:34 AM Andrew Pinski via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> After r14-1014-gc5df248509b489364c573e8, GCC started to emit
> directly a zero_extract for `(t1&0x8)!=0`. This introduced
> a small regression where ifcvt would not do the ifconversion
> as there is now a paradoxical subreg in the dest which
> was being rejected. Since paradoxical subreg set the whole
> register, we can treat it as the same as a reg in the two places.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu and aarch64-linux-gnu.

OK I guess.   I vaguely remember SUBREG_PROMOTED_UNSIGNED_P
applies to non-paradoxical subregs but I might be swapping things - maybe
you remember better and whether that would cause any issues here?

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR rtl-optimization/110042
>         * ifcvt.cc (bbs_ok_for_cmove_arith): Allow paradoxical subregs.
>         (bb_valid_for_noce_process_p): Strip the subreg for the SET_DEST.
>
> gcc/testsuite/ChangeLog:
>
>         PR rtl-optimization/110042
>         * gcc.target/aarch64/csel_bfx_2.c: New test.
> ---
>  gcc/ifcvt.cc                                  | 14 ++++++----
>  gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c | 27 +++++++++++++++++++
>  2 files changed, 36 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
>
> diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
> index 868eda93251..0b180b4568f 100644
> --- a/gcc/ifcvt.cc
> +++ b/gcc/ifcvt.cc
> @@ -2022,7 +2022,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
>         }
>
>        /* Make sure this is a REG and not some instance
> -        of ZERO_EXTRACT or SUBREG or other dangerous stuff.
> +        of ZERO_EXTRACT or non-paradoxical SUBREG or other dangerous stuff.
>          If we have a memory destination then we have a pair of simple
>          basic blocks performing an operation of the form [addr] = c ? a : b.
>          bb_valid_for_noce_process_p will have ensured that these are
> @@ -2030,7 +2030,8 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
>          to be renamed.  Assert that the callers set this up properly.  */
>        if (MEM_P (SET_DEST (sset_b)))
>         gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
> -      else if (!REG_P (SET_DEST (sset_b)))
> +      else if (!REG_P (SET_DEST (sset_b))
> +              && !paradoxical_subreg_p (SET_DEST (sset_b)))
>         {
>           BITMAP_FREE (bba_sets);
>           return false;
> @@ -3136,14 +3137,17 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
>
>           rtx sset = single_set (insn);
>           gcc_assert (sset);
> +         rtx dest = SET_DEST (sset);
> +         if (SUBREG_P (dest))
> +           dest = SUBREG_REG (dest);
>
>           if (contains_mem_rtx_p (SET_SRC (sset))
> -             || !REG_P (SET_DEST (sset))
> -             || reg_overlap_mentioned_p (SET_DEST (sset), cond))
> +             || !REG_P (dest)
> +             || reg_overlap_mentioned_p (dest, cond))
>             goto free_bitmap_and_fail;
>
>           potential_cost += pattern_cost (sset, speed_p);
> -         bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
> +         bitmap_set_bit (test_bb_temps, REGNO (dest));
>         }
>      }
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
> new file mode 100644
> index 00000000000..c3b8a6f45cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +unsigned
> +f1(int t, int t1)
> +{
> +  int tt = 0;
> +  if(t)
> +    tt = (t1&0x8)!=0;
> +  return tt;
> +}
> +struct f
> +{
> +  unsigned t:3;
> +  unsigned t1:4;
> +};
> +unsigned
> +f2(int t, struct f y)
> +{
> +  int tt = 0;
> +  if(t)
> +    tt = y.t1;
> +  return tt;
> +}
> +/* Both f1 and f2 should produce a csel and not a cbz on the argument. */
> +/*  { dg-final { scan-assembler-times "csel\t" 2 } } */
> +/*  { dg-final { scan-assembler-times "ubfx\t" 2 } } */
> +/*  { dg-final { scan-assembler-not "cbz\t" } } */
> --
> 2.31.1
>
  
Andrew Pinski May 31, 2023, 9:22 p.m. UTC | #2
On Wed, May 31, 2023 at 12:29 AM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, May 31, 2023 at 6:34 AM Andrew Pinski via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > After r14-1014-gc5df248509b489364c573e8, GCC started to emit
> > directly a zero_extract for `(t1&0x8)!=0`. This introduced
> > a small regression where ifcvt would not do the ifconversion
> > as there is now a paradoxical subreg in the dest which
> > was being rejected. Since paradoxical subreg set the whole
> > register, we can treat it as the same as a reg in the two places.
> >
> > OK? Bootstrapped and tested on x86_64-linux-gnu and aarch64-linux-gnu.
>
> OK I guess.   I vaguely remember SUBREG_PROMOTED_UNSIGNED_P
> applies to non-paradoxical subregs but I might be swapping things - maybe
> you remember better and whether that would cause any issues here?

So I looked into the history of the code in ifcvt.cc, this code was
added with r6-3071-ge65bf4e814d38c to accept more complex bb
(https://inbox.sourceware.org/gcc-patches/559FBB13.80009@arm.com/).
The thread where we start talking about subregs is located with Jeff's
email starting here:
https://inbox.sourceware.org/gcc-patches/55BBAFAC.5020608@redhat.com/ .

Jeff,
  I know Richard already approved this patch but could you provide a
second eye as you were involved reviewing the original code here and I
want to make sure I understood the code in a a reasonable fashion?

Thanks,
Andrew

>
> Thanks,
> Richard.
>
> > gcc/ChangeLog:
> >
> >         PR rtl-optimization/110042
> >         * ifcvt.cc (bbs_ok_for_cmove_arith): Allow paradoxical subregs.
> >         (bb_valid_for_noce_process_p): Strip the subreg for the SET_DEST.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR rtl-optimization/110042
> >         * gcc.target/aarch64/csel_bfx_2.c: New test.
> > ---
> >  gcc/ifcvt.cc                                  | 14 ++++++----
> >  gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c | 27 +++++++++++++++++++
> >  2 files changed, 36 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
> >
> > diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
> > index 868eda93251..0b180b4568f 100644
> > --- a/gcc/ifcvt.cc
> > +++ b/gcc/ifcvt.cc
> > @@ -2022,7 +2022,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
> >         }
> >
> >        /* Make sure this is a REG and not some instance
> > -        of ZERO_EXTRACT or SUBREG or other dangerous stuff.
> > +        of ZERO_EXTRACT or non-paradoxical SUBREG or other dangerous stuff.
> >          If we have a memory destination then we have a pair of simple
> >          basic blocks performing an operation of the form [addr] = c ? a : b.
> >          bb_valid_for_noce_process_p will have ensured that these are
> > @@ -2030,7 +2030,8 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
> >          to be renamed.  Assert that the callers set this up properly.  */
> >        if (MEM_P (SET_DEST (sset_b)))
> >         gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
> > -      else if (!REG_P (SET_DEST (sset_b)))
> > +      else if (!REG_P (SET_DEST (sset_b))
> > +              && !paradoxical_subreg_p (SET_DEST (sset_b)))
> >         {
> >           BITMAP_FREE (bba_sets);
> >           return false;
> > @@ -3136,14 +3137,17 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
> >
> >           rtx sset = single_set (insn);
> >           gcc_assert (sset);
> > +         rtx dest = SET_DEST (sset);
> > +         if (SUBREG_P (dest))
> > +           dest = SUBREG_REG (dest);
> >
> >           if (contains_mem_rtx_p (SET_SRC (sset))
> > -             || !REG_P (SET_DEST (sset))
> > -             || reg_overlap_mentioned_p (SET_DEST (sset), cond))
> > +             || !REG_P (dest)
> > +             || reg_overlap_mentioned_p (dest, cond))
> >             goto free_bitmap_and_fail;
> >
> >           potential_cost += pattern_cost (sset, speed_p);
> > -         bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
> > +         bitmap_set_bit (test_bb_temps, REGNO (dest));
> >         }
> >      }
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
> > new file mode 100644
> > index 00000000000..c3b8a6f45cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +unsigned
> > +f1(int t, int t1)
> > +{
> > +  int tt = 0;
> > +  if(t)
> > +    tt = (t1&0x8)!=0;
> > +  return tt;
> > +}
> > +struct f
> > +{
> > +  unsigned t:3;
> > +  unsigned t1:4;
> > +};
> > +unsigned
> > +f2(int t, struct f y)
> > +{
> > +  int tt = 0;
> > +  if(t)
> > +    tt = y.t1;
> > +  return tt;
> > +}
> > +/* Both f1 and f2 should produce a csel and not a cbz on the argument. */
> > +/*  { dg-final { scan-assembler-times "csel\t" 2 } } */
> > +/*  { dg-final { scan-assembler-times "ubfx\t" 2 } } */
> > +/*  { dg-final { scan-assembler-not "cbz\t" } } */
> > --
> > 2.31.1
> >
  
Jeff Law June 1, 2023, 2:36 p.m. UTC | #3
On 5/31/23 15:22, Andrew Pinski wrote:
> On Wed, May 31, 2023 at 12:29 AM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> On Wed, May 31, 2023 at 6:34 AM Andrew Pinski via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>>>
>>> After r14-1014-gc5df248509b489364c573e8, GCC started to emit
>>> directly a zero_extract for `(t1&0x8)!=0`. This introduced
>>> a small regression where ifcvt would not do the ifconversion
>>> as there is now a paradoxical subreg in the dest which
>>> was being rejected. Since paradoxical subreg set the whole
>>> register, we can treat it as the same as a reg in the two places.
>>>
>>> OK? Bootstrapped and tested on x86_64-linux-gnu and aarch64-linux-gnu.
>>
>> OK I guess.   I vaguely remember SUBREG_PROMOTED_UNSIGNED_P
>> applies to non-paradoxical subregs but I might be swapping things - maybe
>> you remember better and whether that would cause any issues here?
> 
> So I looked into the history of the code in ifcvt.cc, this code was
> added with r6-3071-ge65bf4e814d38c to accept more complex bb
> (https://inbox.sourceware.org/gcc-patches/559FBB13.80009@arm.com/).
> The thread where we start talking about subregs is located with Jeff's
> email starting here:
> https://inbox.sourceware.org/gcc-patches/55BBAFAC.5020608@redhat.com/ .
> 
> Jeff,
>    I know Richard already approved this patch but could you provide a
> second eye as you were involved reviewing the original code here and I
> want to make sure I understood the code in a a reasonable fashion?
It's been a while.   I think my original concerns were with RMW operands 
and making sure we tracked all sub-components of such operands correctly.

That was based on the original version, but that version looks like it 
should have been OK after reviewing the details of reg_referenced_p. 
It's since moved to DF.

So the only worry I immediately see is whether or not DF is giving uses 
and sets of sub-compenents of a RMW operand or multi-hard register modes.

Jeff
  
Andrew Pinski June 1, 2023, 9:02 p.m. UTC | #4
On Thu, Jun 1, 2023 at 7:36 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/31/23 15:22, Andrew Pinski wrote:
> > On Wed, May 31, 2023 at 12:29 AM Richard Biener via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> On Wed, May 31, 2023 at 6:34 AM Andrew Pinski via Gcc-patches
> >> <gcc-patches@gcc.gnu.org> wrote:
> >>>
> >>> After r14-1014-gc5df248509b489364c573e8, GCC started to emit
> >>> directly a zero_extract for `(t1&0x8)!=0`. This introduced
> >>> a small regression where ifcvt would not do the ifconversion
> >>> as there is now a paradoxical subreg in the dest which
> >>> was being rejected. Since paradoxical subreg set the whole
> >>> register, we can treat it as the same as a reg in the two places.
> >>>
> >>> OK? Bootstrapped and tested on x86_64-linux-gnu and aarch64-linux-gnu.
> >>
> >> OK I guess.   I vaguely remember SUBREG_PROMOTED_UNSIGNED_P
> >> applies to non-paradoxical subregs but I might be swapping things - maybe
> >> you remember better and whether that would cause any issues here?
> >
> > So I looked into the history of the code in ifcvt.cc, this code was
> > added with r6-3071-ge65bf4e814d38c to accept more complex bb
> > (https://inbox.sourceware.org/gcc-patches/559FBB13.80009@arm.com/).
> > The thread where we start talking about subregs is located with Jeff's
> > email starting here:
> > https://inbox.sourceware.org/gcc-patches/55BBAFAC.5020608@redhat.com/ .
> >
> > Jeff,
> >    I know Richard already approved this patch but could you provide a
> > second eye as you were involved reviewing the original code here and I
> > want to make sure I understood the code in a a reasonable fashion?
> It's been a while.   I think my original concerns were with RMW operands
> and making sure we tracked all sub-components of such operands correctly.
>
> That was based on the original version, but that version looks like it
> should have been OK after reviewing the details of reg_referenced_p.
> It's since moved to DF.
>
> So the only worry I immediately see is whether or not DF is giving uses
> and sets of sub-compenents of a RMW operand or multi-hard register modes.

So I looked into the code some more, for bb_valid_for_noce_process_p,
we are checking to make sure if a reg that was being set is not used
by the comparison (and it works using reg_overlap_mentioned_p that
satisfies the multi-hard register mode use case and satisfies the RMW
case as we are just checking to make sure it does not do that to the
registers of the comparison).

For bbs_ok_for_cmove_arith, having the check as paradoxical_subreg_p
(rather than a plain SUBREG) satisfies RMW operand case (as it is not
a RMW operation) and DF already handles multi-hard register when using
FOR_EACH_INSN_DEF/FOR_EACH_INSN_USE (and a paradoxical hard register
is an invalid RTL in the first place).

I wrote this up more to convince myself this is safe and for future
references for others when looking into the code to understand it.

Thanks,
Andrew

>
> Jeff
  

Patch

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 868eda93251..0b180b4568f 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -2022,7 +2022,7 @@  bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
 	}
 
       /* Make sure this is a REG and not some instance
-	 of ZERO_EXTRACT or SUBREG or other dangerous stuff.
+	 of ZERO_EXTRACT or non-paradoxical SUBREG or other dangerous stuff.
 	 If we have a memory destination then we have a pair of simple
 	 basic blocks performing an operation of the form [addr] = c ? a : b.
 	 bb_valid_for_noce_process_p will have ensured that these are
@@ -2030,7 +2030,8 @@  bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
 	 to be renamed.  Assert that the callers set this up properly.  */
       if (MEM_P (SET_DEST (sset_b)))
 	gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
-      else if (!REG_P (SET_DEST (sset_b)))
+      else if (!REG_P (SET_DEST (sset_b))
+	       && !paradoxical_subreg_p (SET_DEST (sset_b)))
 	{
 	  BITMAP_FREE (bba_sets);
 	  return false;
@@ -3136,14 +3137,17 @@  bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
 
 	  rtx sset = single_set (insn);
 	  gcc_assert (sset);
+	  rtx dest = SET_DEST (sset);
+	  if (SUBREG_P (dest))
+	    dest = SUBREG_REG (dest);
 
 	  if (contains_mem_rtx_p (SET_SRC (sset))
-	      || !REG_P (SET_DEST (sset))
-	      || reg_overlap_mentioned_p (SET_DEST (sset), cond))
+	      || !REG_P (dest)
+	      || reg_overlap_mentioned_p (dest, cond))
 	    goto free_bitmap_and_fail;
 
 	  potential_cost += pattern_cost (sset, speed_p);
-	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+	  bitmap_set_bit (test_bb_temps, REGNO (dest));
 	}
     }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
new file mode 100644
index 00000000000..c3b8a6f45cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/csel_bfx_2.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+unsigned
+f1(int t, int t1)
+{
+  int tt = 0;
+  if(t)
+    tt = (t1&0x8)!=0;
+  return tt;
+}
+struct f
+{
+  unsigned t:3;
+  unsigned t1:4;
+};
+unsigned
+f2(int t, struct f y)
+{
+  int tt = 0;
+  if(t)
+    tt = y.t1;
+  return tt;
+}
+/* Both f1 and f2 should produce a csel and not a cbz on the argument. */
+/*  { dg-final { scan-assembler-times "csel\t" 2 } } */
+/*  { dg-final { scan-assembler-times "ubfx\t" 2 } } */
+/*  { dg-final { scan-assembler-not "cbz\t" } } */