diff mbox series

[PR103302] skip multi-word pre-move clobber during lra

Message ID	orfsr38xn8.fsf@lxoliva.fsfla.org
State	Committed
Commit	50e8b0c9bca6cdc57804f860ec5311b641753fbb
Headers	DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org EE7423858D28 To: gcc-patches@gcc.gnu.org Subject: [PR103302] skip multi-word pre-move clobber during lra Organization: Free thinker, does not speak for AdaCore Date: Wed, 08 Dec 2021 02:37:47 -0300 Message-ID: <orfsr38xn8.fsf@lxoliva.fsfla.org> User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/25.2 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain Precedence: list From: Alexandre Oliva via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Alexandre Oliva <oliva@adacore.com> Cc: zsojka@seznam.cz, wilson@gcc.gnu.org Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	[PR103302] skip multi-word pre-move clobber during lra \| [PR103302] skip multi-word pre-move clobber during lra

Commit Message

Alexandre Oliva Dec. 8, 2021, 5:37 a.m. UTC

  If we emit clobbers before multi-word moves during lra, we get
confused if a copy ends up with input or output replaced with each
other: the clobber then kills the previous set, and it gets deleted.

This patch avoids emitting such clobbers when lra_in_progress.

Regstrapped on x86_64-linux-gnu.  Verified that, applied on a riscv64
compiler that failed the test, the asm statements are no longer dropped
in the reload dumps.  Running a x86_64-x-riscv64 regression testing now.
Ok to install?


for  gcc/ChangeLog

	PR target/103302
	expr.c (emit_move_multi_word): Skip clobber during lra.

for  gcc/testsuite/ChangeLog

	PR target/103302
	* gcc.target/riscv/pr103302.c: New.
---
 gcc/expr.c                                |    2 +
 gcc/testsuite/gcc.target/riscv/pr103302.c |   47 +++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr103302.c

Comments

Jeff Law Dec. 8, 2021, 11:12 p.m. UTC | #1

On 12/7/2021 10:37 PM, Alexandre Oliva via Gcc-patches wrote:
> If we emit clobbers before multi-word moves during lra, we get
> confused if a copy ends up with input or output replaced with each
> other: the clobber then kills the previous set, and it gets deleted.
>
> This patch avoids emitting such clobbers when lra_in_progress.
>
> Regstrapped on x86_64-linux-gnu.  Verified that, applied on a riscv64
> compiler that failed the test, the asm statements are no longer dropped
> in the reload dumps.  Running a x86_64-x-riscv64 regression testing now.
> Ok to install?
>
>
> for  gcc/ChangeLog
>
> 	PR target/103302
> 	expr.c (emit_move_multi_word): Skip clobber during lra.
>
> for  gcc/testsuite/ChangeLog
>
> 	PR target/103302
> 	* gcc.target/riscv/pr103302.c: New.
OK.  Nit in the ChangeLog.  You forgot a '*' before the expr.c entry.

jeff

Alexandre Oliva Dec. 9, 2021, 2:25 a.m. UTC | #2

On Dec  8, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:

> On 12/7/2021 10:37 PM, Alexandre Oliva via Gcc-patches wrote:

>> expr.c (emit_move_multi_word): Skip clobber during lra.

> OK.  Nit in the ChangeLog.  You forgot a '*' before the expr.c entry.

Thanks, fixed.  Here's what I'm installing momentarily.


[PR103302] skip multi-word pre-move clobber during lra

If we emit clobbers before multi-word moves during lra, we get
confused if a copy ends up with input or output replaced with each
other: the clobber then kills the previous set, and it gets deleted.

This patch avoids emitting such clobbers when lra_in_progress.


for  gcc/ChangeLog

	PR target/103302
	* expr.c (emit_move_multi_word): Skip clobber during lra.

for  gcc/testsuite/ChangeLog

	PR target/103302
	* gcc.target/riscv/pr103302.c: New.
---
 gcc/expr.c                                |    2 +
 gcc/testsuite/gcc.target/riscv/pr103302.c |   47 +++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr103302.c

diff --git a/gcc/expr.c b/gcc/expr.c
index b281525750978..0365625e7b835 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -3929,7 +3929,7 @@ emit_move_multi_word (machine_mode mode, rtx x, rtx y)
      hard regs shouldn't appear here except as return values.
      We never want to emit such a clobber after reload.  */
   if (x != y
-      && ! (reload_in_progress || reload_completed)
+      && ! (lra_in_progress || reload_in_progress || reload_completed)
       && need_clobber != 0)
     emit_clobber (x);
 
diff --git a/gcc/testsuite/gcc.target/riscv/pr103302.c b/gcc/testsuite/gcc.target/riscv/pr103302.c
new file mode 100644
index 0000000000000..822c408741645
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr103302.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-Og -fharden-compares -fno-tree-dce -fno-tree-fre " } */
+
+typedef unsigned char u8;
+typedef unsigned char __attribute__((__vector_size__ (32))) v256u8;
+typedef unsigned short __attribute__((__vector_size__ (32))) v256u16;
+typedef unsigned short __attribute__((__vector_size__ (64))) v512u16;
+typedef unsigned int u32;
+typedef unsigned int __attribute__((__vector_size__ (4))) v512u32;
+typedef unsigned long long __attribute__((__vector_size__ (32))) v256u64;
+typedef unsigned long long __attribute__((__vector_size__ (64))) v512u64;
+typedef unsigned __int128 __attribute__((__vector_size__ (32))) v256u128;
+typedef unsigned __int128 __attribute__((__vector_size__ (64))) v512u128;
+
+v512u16 g;
+
+void
+foo0 (u8 u8_0, v256u16 v256u16_0, v512u16 v512u16_0, u32 u32_0, v512u32,
+      v256u64 v256u64_0, v512u64 v512u64_0, v256u128 v256u128_0,
+      v512u128 v512u128_0)
+{
+  u32_0 <= (v512u128) (v512u128_0 != u8_0);
+  v512u64 v512u64_1 =
+    __builtin_shufflevector (v256u64_0, v512u64_0, 7, 8, 0, 9, 5, 0, 3, 1);
+  g = v512u16_0;
+  (v256u8) v256u16_0 + (v256u8) v256u128_0;
+}
+
+int
+main (void)
+{
+  foo0 (40, (v256u16)
+	{
+	}, (v512u16)
+	{
+	}, 0, (v512u32)
+	{
+	}, (v256u64)
+	{
+	}, (v512u64)
+	{
+	}, (v256u128)
+	{
+	}, (v512u128)
+	{
+	});
+}

Alexandre Oliva Dec. 9, 2021, 4:08 a.m. UTC | #3

On Dec  8, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:

>> expr.c (emit_move_multi_word): Skip clobber during lra.

> OK.

I found a similar pattern of issuing clobbers for multi-word moves, but
not when reload_in_progress, in expr.c:emit_move_complex_parts.  I don't
have a testcase, but I'm tempted to propose '!lra_in_progress &&' for it
as well.  Can you think of any reason not to?

I also see lots of uses of reload_in_progress in machine-dependent code,
and I suspect many cases involving enabling patterns or checking for
legitimate addresses might benefit from the addition of lra_in_progress,
but that's too many occurrences to try to make sense of :-(

Jeff Law Dec. 9, 2021, 6:03 a.m. UTC | #4

On 12/8/2021 9:08 PM, Alexandre Oliva wrote:
> On Dec  8, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>>> expr.c (emit_move_multi_word): Skip clobber during lra.
>> OK.
> I found a similar pattern of issuing clobbers for multi-word moves, but
> not when reload_in_progress, in expr.c:emit_move_complex_parts.  I don't
> have a testcase, but I'm tempted to propose '!lra_in_progress &&' for it
> as well.  Can you think of any reason not to?
The only reason I can think of is we're in stage3 :-)  It'd be a lot 
easier to green light that if we could trigger an issue.

>
>
> I also see lots of uses of reload_in_progress in machine-dependent code,
> and I suspect many cases involving enabling patterns or checking for
> legitimate addresses might benefit from the addition of lra_in_progress,
> but that's too many occurrences to try to make sense of :-(
Yea, very likely.

jeff

Alexandre Oliva Dec. 15, 2021, 8:22 a.m. UTC | #5

On Dec  9, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:

>> I found a similar pattern of issuing clobbers for multi-word moves, but
>> not when reload_in_progress, in expr.c:emit_move_complex_parts.  I don't
>> have a testcase, but I'm tempted to propose '!lra_in_progress &&' for it
>> as well.  Can you think of any reason not to?

> The only reason I can think of is we're in stage3 :-)  It'd be a lot
> easier to green light that if we could trigger an issue.

I have not found the cycles to try to construct a testcase to trigger
the issue, but before moving on, I have regstrapped this on
x86_64-linux-gnu, so, at least for now, I propose it for the next
release cycle.  Ok to install then?


[PR103302] skip multi-part clobber during lra for complex parts too

From: Alexandre Oliva <oliva@adacore.com>

As with the earlier patch, avoid emitting clobbers that we used to
avoid during reload also during LRA, now when moving complex
multi-part values.  We don't have a testcase for this one.


for  gcc/ChangeLog

	PR target/103302
	* expr.c (emit_move_complex_parts): Skip clobbers during lra.
---
 gcc/expr.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/expr.c b/gcc/expr.c
index 0365625e7b835..30d1735ec29ce 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -3736,7 +3736,7 @@ emit_move_complex_parts (rtx x, rtx y)
   /* Show the output dies here.  This is necessary for SUBREGs
      of pseudos since we cannot track their lifetimes correctly;
      hard regs shouldn't appear here except as return values.  */
-  if (!reload_completed && !reload_in_progress
+  if (!reload_completed && !reload_in_progress && !lra_in_progress
       && REG_P (x) && !reg_overlap_mentioned_p (x, y))
     emit_clobber (x);

Jeff Law Dec. 15, 2021, 4 p.m. UTC | #6

On 12/15/2021 1:22 AM, Alexandre Oliva wrote:
> On Dec  9, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>>> I found a similar pattern of issuing clobbers for multi-word moves, but
>>> not when reload_in_progress, in expr.c:emit_move_complex_parts.  I don't
>>> have a testcase, but I'm tempted to propose '!lra_in_progress &&' for it
>>> as well.  Can you think of any reason not to?
>> The only reason I can think of is we're in stage3 :-)  It'd be a lot
>> easier to green light that if we could trigger an issue.
> I have not found the cycles to try to construct a testcase to trigger
> the issue, but before moving on, I have regstrapped this on
> x86_64-linux-gnu, so, at least for now, I propose it for the next
> release cycle.  Ok to install then?
>
>
> [PR103302] skip multi-part clobber during lra for complex parts too
>
> From: Alexandre Oliva <oliva@adacore.com>
>
> As with the earlier patch, avoid emitting clobbers that we used to
> avoid during reload also during LRA, now when moving complex
> multi-part values.  We don't have a testcase for this one.
>
>
> for  gcc/ChangeLog
>
> 	PR target/103302
> 	* expr.c (emit_move_complex_parts): Skip clobbers during lra.
OK for the next cycle.
jeff

Alexandre Oliva Feb. 18, 2022, 11:27 p.m. UTC | #7

On Dec 15, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:

>> * expr.c (emit_move_complex_parts): Skip clobbers during lra.
> OK for the next cycle.

Thanks, but having looked into PR 104121, I withdraw this patch and also
the already-installed patch for PR 103302.  As I found out, LRA does
worse without the clobbers for multi-word moves, not only because the
clobbers shorten live ranges and enable earlier and better allocations,
but also because find_reload_regno_insns implicitly, possibly
unknowingly, relied on the clobbers to avoid the risk of an infinite
loop.

As noted in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104121#c11 with
the clobber, a multi-word reload, and the insn the reload applies to, we
get 4 insns, so find_reload_regno_insns avoids the loop.  Without the
clobber, a multi-word reload for either input or output makes for n==3,
so we enter the loop and don't ever exit it: we'll find first_insn
(input) or second_insn (output), but then we'll loop forever because we
won't iterate again on {prev,next}_insn, respectively, and the other
iterator won't find the other word reload.  We advance the other till
the end, but that's not enough for us to terminate the loop.

With the proposed patch reversal, we no longer hit the problem with the
v850 testcase in 104121, but I'm concerned we might still get an
infinite loop on ports whose input or output reloads might emit a pair
of insns without a clobber.

I see two ways to robustify it.  One is to find a complete reload
sequence:

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index c1d40ea2a14bd..ff1688917cbce 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1716,9 +1716,18 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
 	start_insn = lra_insn_recog_data[uid]->insn;
       n++;
     }
-  /* For reload pseudo we should have at most 3 insns referring for
-     it: input/output reload insns and the original insn.  */
-  if (n > 3)
+  /* For reload pseudo we should have at most 3 (sequences of) insns
+     referring for it: input/output reload insn sequences and the
+     original insn.  Each reload insn sequence may amount to multiple
+     insns, but we expect to find each of them contiguous, one before
+     start_insn, one after it.  We know start_insn is between the
+     sequences because it's the lowest-numbered insn, thus the first
+     we'll have found above.  The reload insns, emitted later, will
+     have been assigned higher insn uids.  If this assumption doesn't
+     hold, and there happen to be intervening reload insns for other
+     pseudos, we may end up returning false after searching to the end
+     in the wrong direction.  */
+  if (n > 1 + 2 * CEIL (lra_reg_info[regno].biggest_mode, UNITS_PER_WORD))
     return false;
   if (n > 1)
     {
@@ -1726,26 +1735,52 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
 	     next_insn = NEXT_INSN (start_insn);
 	   n != 1 && (prev_insn != NULL || next_insn != NULL); )
 	{
-	  if (prev_insn != NULL && first_insn == NULL)
+	  if (prev_insn != NULL)
 	    {
 	      if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
 				  INSN_UID (prev_insn)))
 		prev_insn = PREV_INSN (prev_insn);
 	      else
 		{
-		  first_insn = prev_insn;
-		  n--;
+		  /* A reload sequence may have multiple insns, but
+		     they must be contiguous.  */
+		  do
+		    {
+		      first_insn = prev_insn;
+		      n--;
+		      prev_insn = PREV_INSN (prev_insn);
+		    }
+		  while (n > 1 && prev_insn
+			 && bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
+					  INSN_UID (prev_insn)));
+		  /* After finding first_insn, we don't want to search
+		     backward any more, so set prev_insn to NULL so as
+		     to not loop indefinitely.  */
+		  prev_insn = NULL;
 		}
 	    }
-	  if (next_insn != NULL && second_insn == NULL)
+	  else if (next_insn != NULL)
 	    {
 	      if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
 				INSN_UID (next_insn)))
 		next_insn = NEXT_INSN (next_insn);
 	      else
 		{
-		  second_insn = next_insn;
-		  n--;
+		  /* A reload sequence may have multiple insns, but
+		     they must be contiguous.  */
+		  do
+		    {
+		      second_insn = next_insn;
+		      n--;
+		      next_insn = NEXT_INSN (next_insn);
+		    }
+		  while (n > 1 && next_insn
+			 && bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
+					  INSN_UID (next_insn)));
+		  /* After finding second_insn, we don't want to
+		     search forward any more, so set next_insn to NULL
+		     so as to not loop indefinitely.  */
+		  next_insn = NULL;
 		}
 	    }
 	}


the other is to just prevent the infinite loop, that will then return
false because n > 1 after the loop ends:

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index c1d40ea2a14bd..efd5f764a37a5 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1726,7 +1726,7 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
 	     next_insn = NEXT_INSN (start_insn);
 	   n != 1 && (prev_insn != NULL || next_insn != NULL); )
 	{
-	  if (prev_insn != NULL && first_insn == NULL)
+	  if (prev_insn != NULL)
 	    {
 	      if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
 				  INSN_UID (prev_insn)))
@@ -1735,9 +1735,10 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
 		{
 		  first_insn = prev_insn;
 		  n--;
+		  prev_insn = NULL;
 		}
 	    }
-	  if (next_insn != NULL && second_insn == NULL)
+	  if (next_insn != NULL)
 	    {
 	      if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
 				INSN_UID (next_insn)))
@@ -1746,6 +1747,7 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
 		{
 		  second_insn = next_insn;
 		  n--;
+		  next_insn = NULL;
 		}
 	    }
 	}


When it comes to the v850 testcase, one of them just moves the infinite
loop to lra(), as we never get past while(fails_p); the other hits
lra-assigns.cc:lra_assign (bool&)'s

  if (flag_checking
      && (lra_assignment_iter_after_spill
	  > LRA_MAX_ASSIGNMENT_ITERATION_NUMBER))
    internal_error
      ("maximum number of LRA assignment passes is achieved (%d)",
       LRA_MAX_ASSIGNMENT_ITERATION_NUMBER);

which would loop indefinitely too without flag_checking.

Neither solves the v850 problem, only restoring the clobber does,
because then, with shorter live ranges, allocation succeeds for the
reloads, and we don't even try to split -> spill their pseudos.

Would any of these patchlets make sense to pursue regardless?

Ok to revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

I'm going to get back to the drawing board as to pr103302, since the
problem there will likely resurface, possibly also on v850.


diff --git a/gcc/expr.cc b/gcc/expr.cc
index 63a4aa838dec0..b6ed54983fabf 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -3929,7 +3929,7 @@ emit_move_multi_word (machine_mode mode, rtx x, rtx y)
      hard regs shouldn't appear here except as return values.
      We never want to emit such a clobber after reload.  */
   if (x != y
-      && ! (lra_in_progress || reload_in_progress || reload_completed)
+      && ! (reload_in_progress || reload_completed)
       && need_clobber != 0)
     emit_clobber (x);

Richard Biener Feb. 21, 2022, 7:13 a.m. UTC | #8

On Sat, Feb 19, 2022 at 12:28 AM Alexandre Oliva via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Dec 15, 2021, Jeff Law <jeffreyalaw@gmail.com> wrote:
>
> >> * expr.c (emit_move_complex_parts): Skip clobbers during lra.
> > OK for the next cycle.
>
> Thanks, but having looked into PR 104121, I withdraw this patch and also
> the already-installed patch for PR 103302.  As I found out, LRA does
> worse without the clobbers for multi-word moves, not only because the
> clobbers shorten live ranges and enable earlier and better allocations,
> but also because find_reload_regno_insns implicitly, possibly
> unknowingly, relied on the clobbers to avoid the risk of an infinite
> loop.
>
> As noted in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104121#c11 with
> the clobber, a multi-word reload, and the insn the reload applies to, we
> get 4 insns, so find_reload_regno_insns avoids the loop.  Without the
> clobber, a multi-word reload for either input or output makes for n==3,
> so we enter the loop and don't ever exit it: we'll find first_insn
> (input) or second_insn (output), but then we'll loop forever because we
> won't iterate again on {prev,next}_insn, respectively, and the other
> iterator won't find the other word reload.  We advance the other till
> the end, but that's not enough for us to terminate the loop.
>
> With the proposed patch reversal, we no longer hit the problem with the
> v850 testcase in 104121, but I'm concerned we might still get an
> infinite loop on ports whose input or output reloads might emit a pair
> of insns without a clobber.
>
> I see two ways to robustify it.  One is to find a complete reload
> sequence:
>
> diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
> index c1d40ea2a14bd..ff1688917cbce 100644
> --- a/gcc/lra-assigns.cc
> +++ b/gcc/lra-assigns.cc
> @@ -1716,9 +1716,18 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
>         start_insn = lra_insn_recog_data[uid]->insn;
>        n++;
>      }
> -  /* For reload pseudo we should have at most 3 insns referring for
> -     it: input/output reload insns and the original insn.  */
> -  if (n > 3)
> +  /* For reload pseudo we should have at most 3 (sequences of) insns
> +     referring for it: input/output reload insn sequences and the
> +     original insn.  Each reload insn sequence may amount to multiple
> +     insns, but we expect to find each of them contiguous, one before
> +     start_insn, one after it.  We know start_insn is between the
> +     sequences because it's the lowest-numbered insn, thus the first
> +     we'll have found above.  The reload insns, emitted later, will
> +     have been assigned higher insn uids.  If this assumption doesn't
> +     hold, and there happen to be intervening reload insns for other
> +     pseudos, we may end up returning false after searching to the end
> +     in the wrong direction.  */
> +  if (n > 1 + 2 * CEIL (lra_reg_info[regno].biggest_mode, UNITS_PER_WORD))
>      return false;
>    if (n > 1)
>      {
> @@ -1726,26 +1735,52 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
>              next_insn = NEXT_INSN (start_insn);
>            n != 1 && (prev_insn != NULL || next_insn != NULL); )
>         {
> -         if (prev_insn != NULL && first_insn == NULL)
> +         if (prev_insn != NULL)
>             {
>               if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
>                                   INSN_UID (prev_insn)))
>                 prev_insn = PREV_INSN (prev_insn);
>               else
>                 {
> -                 first_insn = prev_insn;
> -                 n--;
> +                 /* A reload sequence may have multiple insns, but
> +                    they must be contiguous.  */
> +                 do
> +                   {
> +                     first_insn = prev_insn;
> +                     n--;
> +                     prev_insn = PREV_INSN (prev_insn);
> +                   }
> +                 while (n > 1 && prev_insn
> +                        && bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
> +                                         INSN_UID (prev_insn)));
> +                 /* After finding first_insn, we don't want to search
> +                    backward any more, so set prev_insn to NULL so as
> +                    to not loop indefinitely.  */
> +                 prev_insn = NULL;
>                 }
>             }
> -         if (next_insn != NULL && second_insn == NULL)
> +         else if (next_insn != NULL)
>             {
>               if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
>                                 INSN_UID (next_insn)))
>                 next_insn = NEXT_INSN (next_insn);
>               else
>                 {
> -                 second_insn = next_insn;
> -                 n--;
> +                 /* A reload sequence may have multiple insns, but
> +                    they must be contiguous.  */
> +                 do
> +                   {
> +                     second_insn = next_insn;
> +                     n--;
> +                     next_insn = NEXT_INSN (next_insn);
> +                   }
> +                 while (n > 1 && next_insn
> +                        && bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
> +                                         INSN_UID (next_insn)));
> +                 /* After finding second_insn, we don't want to
> +                    search forward any more, so set next_insn to NULL
> +                    so as to not loop indefinitely.  */
> +                 next_insn = NULL;
>                 }
>             }
>         }
>
>
> the other is to just prevent the infinite loop, that will then return
> false because n > 1 after the loop ends:
>
> diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
> index c1d40ea2a14bd..efd5f764a37a5 100644
> --- a/gcc/lra-assigns.cc
> +++ b/gcc/lra-assigns.cc
> @@ -1726,7 +1726,7 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
>              next_insn = NEXT_INSN (start_insn);
>            n != 1 && (prev_insn != NULL || next_insn != NULL); )
>         {
> -         if (prev_insn != NULL && first_insn == NULL)
> +         if (prev_insn != NULL)
>             {
>               if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
>                                   INSN_UID (prev_insn)))
> @@ -1735,9 +1735,10 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
>                 {
>                   first_insn = prev_insn;
>                   n--;
> +                 prev_insn = NULL;
>                 }
>             }
> -         if (next_insn != NULL && second_insn == NULL)
> +         if (next_insn != NULL)
>             {
>               if (! bitmap_bit_p (&lra_reg_info[regno].insn_bitmap,
>                                 INSN_UID (next_insn)))
> @@ -1746,6 +1747,7 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
>                 {
>                   second_insn = next_insn;
>                   n--;
> +                 next_insn = NULL;
>                 }
>             }
>         }
>
>
> When it comes to the v850 testcase, one of them just moves the infinite
> loop to lra(), as we never get past while(fails_p); the other hits
> lra-assigns.cc:lra_assign (bool&)'s
>
>   if (flag_checking
>       && (lra_assignment_iter_after_spill
>           > LRA_MAX_ASSIGNMENT_ITERATION_NUMBER))
>     internal_error
>       ("maximum number of LRA assignment passes is achieved (%d)",
>        LRA_MAX_ASSIGNMENT_ITERATION_NUMBER);
>
> which would loop indefinitely too without flag_checking.
>
> Neither solves the v850 problem, only restoring the clobber does,
> because then, with shorter live ranges, allocation succeeds for the
> reloads, and we don't even try to split -> spill their pseudos.
>
> Would any of these patchlets make sense to pursue regardless?
>
> Ok to revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

OK.  Please re-open the bug as appropriate.

> I'm going to get back to the drawing board as to pr103302, since the
> problem there will likely resurface, possibly also on v850.
>
>
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 63a4aa838dec0..b6ed54983fabf 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -3929,7 +3929,7 @@ emit_move_multi_word (machine_mode mode, rtx x, rtx y)
>       hard regs shouldn't appear here except as return values.
>       We never want to emit such a clobber after reload.  */
>    if (x != y
> -      && ! (lra_in_progress || reload_in_progress || reload_completed)
> +      && ! (reload_in_progress || reload_completed)
>        && need_clobber != 0)
>      emit_clobber (x);
>
>
>
> --
> Alexandre Oliva, happy hacker                https://FSFLA.org/blogs/lxo/
>    Free Software Activist                       GNU Toolchain Engineer
> Disinformation flourishes because many people care deeply about injustice
> but very few check the facts.  Ask me about <https://stallmansupport.org>

Alexandre Oliva Feb. 23, 2022, 10:39 p.m. UTC | #9

On Feb 21, 2022, Richard Biener <richard.guenther@gmail.com> wrote:

>> Ok to revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

> OK.  Please re-open the bug as appropriate.

Thanks.  I've reopened it.  Here's what I'm installing.  I'm not
reverting the testcase, since it stopped failing even before the patch
was put in.


Revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

The patch for PR103302 caused PR104121, and extended the live ranges
of LRA reloads.


for gcc/ChangeLog

	PR target/104121
	PR target/103302
	* expr.cc (emit_move_multi_word): Restore clobbers during LRA.
---
 gcc/expr.cc |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 35e40299753bb..5f7142b975ada 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -3929,7 +3929,7 @@ emit_move_multi_word (machine_mode mode, rtx x, rtx y)
      hard regs shouldn't appear here except as return values.
      We never want to emit such a clobber after reload.  */
   if (x != y
-      && ! (lra_in_progress || reload_in_progress || reload_completed)
+      && ! (reload_in_progress || reload_completed)
       && need_clobber != 0)
     emit_clobber (x);

Alexandre Oliva March 1, 2022, 8:15 p.m. UTC | #10

On Feb 23, 2022, Alexandre Oliva <oliva@adacore.com> wrote:

> On Feb 21, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>>> Ok to revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

>> OK.  Please re-open the bug as appropriate.

> Thanks.  I've reopened it.  Here's what I'm installing.  I'm not
> reverting the testcase, since it stopped failing even before the patch
> was put in.

And now here's a patch that fixes the underlying issue.


Undo multi-word optional reloads correctly

From: Alexandre Oliva <oliva@adacore.com>

Unlike e.g. remove_inheritance_pseudos, undo_optional_reloads didn't
deal with subregs, so instead of removing multi-word moves, it
replaced the reload pseudo with the original pseudo.  Besides the
redundant move, that retained the clobber of the dest, that starts a
multi-word move.  After the remap, the sequence that should have
become a no-op move starts by clobbering the original pseudo and then
moving its pieces onto themselves.  The problem is the clobber: it
makes earlier sets of the original pseudo to be regarded as dead: if
the optional reload sequence was an output reload, the insn for which
the output reload was attempted may be regarded as dead and deleted.

I've arranged for undo_optional_reloads to accept SUBREGs and use
get_regno, like remove_inheritance_pseudo, adjusted its insn-removal
loop to tolerate iterating over a removed clobber, and added logic to
catch any left-over reload clobbers that could trigger the problem.


for  gcc/ChangeLog

	* lra-constraints.cc (undo_optional_reloads): Recognize and
	drop insns of multi-word move sequences, tolerate removal
	iteration on an already-removed clobber, and refuse to
	substitute original pseudos into clobbers.
---
 gcc/lra-constraints.cc |   37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index b2c4590153c4c..5cd89e7eeddc2 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -7261,15 +7261,17 @@ undo_optional_reloads (void)
 	      continue;
 	    src = SET_SRC (set);
 	    dest = SET_DEST (set);
-	    if (! REG_P (src) || ! REG_P (dest))
+	    if ((! REG_P (src) && ! SUBREG_P (src))
+		|| (! REG_P (dest) && ! SUBREG_P (dest)))
 	      continue;
-	    if (REGNO (dest) == regno
+	    if (get_regno (dest) == (int) regno
 		/* Ignore insn for optional reloads itself.  */
-		&& REGNO (lra_reg_info[regno].restore_rtx) != REGNO (src)
+		&& (get_regno (lra_reg_info[regno].restore_rtx)
+		    != get_regno (src))
 		/* Check only inheritance on last inheritance pass.  */
-		&& (int) REGNO (src) >= new_regno_start
+		&& get_regno (src) >= new_regno_start
 		/* Check that the optional reload was inherited.  */
-		&& bitmap_bit_p (&lra_inheritance_pseudos, REGNO (src)))
+		&& bitmap_bit_p (&lra_inheritance_pseudos, get_regno (src)))
 	      {
 		keep_p = true;
 		break;
@@ -7291,18 +7293,22 @@ undo_optional_reloads (void)
       bitmap_copy (insn_bitmap, &lra_reg_info[regno].insn_bitmap);
       EXECUTE_IF_SET_IN_BITMAP (insn_bitmap, 0, uid, bi2)
 	{
+	  /* We may have already removed a clobber.  */
+	  if (!lra_insn_recog_data[uid])
+	    continue;
 	  insn = lra_insn_recog_data[uid]->insn;
 	  if ((set = single_set (insn)) != NULL_RTX)
 	    {
 	      src = SET_SRC (set);
 	      dest = SET_DEST (set);
-	      if (REG_P (src) && REG_P (dest)
-		  && ((REGNO (src) == regno
-		       && (REGNO (lra_reg_info[regno].restore_rtx)
-			   == REGNO (dest)))
-		      || (REGNO (dest) == regno
-			  && (REGNO (lra_reg_info[regno].restore_rtx)
-			      == REGNO (src)))))
+	      if ((REG_P (src) || SUBREG_P (src))
+		  && (REG_P (dest) || SUBREG_P (dest))
+		  && ((get_regno (src) == (int) regno
+		       && (get_regno (lra_reg_info[regno].restore_rtx)
+			   == get_regno (dest)))
+		      || (get_regno (dest) == (int) regno
+			  && (get_regno (lra_reg_info[regno].restore_rtx)
+			      == get_regno (src)))))
 		{
 		  if (lra_dump_file != NULL)
 		    {
@@ -7310,7 +7316,7 @@ undo_optional_reloads (void)
 			       INSN_UID (insn));
 		      dump_insn_slim (lra_dump_file, insn);
 		    }
-		  delete_move_and_clobber (insn, REGNO (dest));
+		  delete_move_and_clobber (insn, get_regno (dest));
 		  continue;
 		}
 	      /* We should not worry about generation memory-memory
@@ -7319,6 +7325,11 @@ undo_optional_reloads (void)
 		 we remove the inheritance pseudo and the optional
 		 reload.  */
 	    }
+	  if (GET_CODE (PATTERN (insn)) == CLOBBER
+	      && REG_P (SET_DEST (insn))
+	      && get_regno (SET_DEST (insn)) == (int) regno)
+	    /* Refuse to remap clobbers to preexisting pseudos.  */
+	    gcc_unreachable ();
 	  lra_substitute_pseudo_within_insn
 	    (insn, regno, lra_reg_info[regno].restore_rtx, false);
 	  lra_update_insn_regno_info (insn);

Alexandre Oliva March 2, 2022, 12:25 p.m. UTC | #11

On Mar  1, 2022, Alexandre Oliva <oliva@adacore.com> wrote:

> On Feb 23, 2022, Alexandre Oliva <oliva@adacore.com> wrote:
>> On Feb 21, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>>>> Ok to revert commit r12-5852-g50e8b0c9bca6cdc57804f860ec5311b641753fbb

>>> OK.  Please re-open the bug as appropriate.

>> Thanks.  I've reopened it.  Here's what I'm installing.  I'm not
>> reverting the testcase, since it stopped failing even before the patch
>> was put in.

> And now here's a patch that fixes the underlying issue.

Oops, I missed the important question:


> Undo multi-word optional reloads correctly

> Unlike e.g. remove_inheritance_pseudos, undo_optional_reloads didn't
> deal with subregs, so instead of removing multi-word moves, it
> replaced the reload pseudo with the original pseudo.  Besides the
> redundant move, that retained the clobber of the dest, that starts a
> multi-word move.  After the remap, the sequence that should have
> become a no-op move starts by clobbering the original pseudo and then
> moving its pieces onto themselves.  The problem is the clobber: it
> makes earlier sets of the original pseudo to be regarded as dead: if
> the optional reload sequence was an output reload, the insn for which
> the output reload was attempted may be regarded as dead and deleted.

> I've arranged for undo_optional_reloads to accept SUBREGs and use
> get_regno, like remove_inheritance_pseudo, adjusted its insn-removal
> loop to tolerate iterating over a removed clobber, and added logic to
> catch any left-over reload clobbers that could trigger the problem.

Regstrapped on x86_64-linux-gnu, also tested on various riscv and arm
targets (with gcc-11).  Ok to install?


> for  gcc/ChangeLog

> 	* lra-constraints.cc (undo_optional_reloads): Recognize and
> 	drop insns of multi-word move sequences, tolerate removal
> 	iteration on an already-removed clobber, and refuse to
> 	substitute original pseudos into clobbers.
> ---
>  gcc/lra-constraints.cc |   37 ++++++++++++++++++++++++-------------
>  1 file changed, 24 insertions(+), 13 deletions(-)

> diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
> index b2c4590153c4c..5cd89e7eeddc2 100644
> --- a/gcc/lra-constraints.cc
> +++ b/gcc/lra-constraints.cc
> @@ -7261,15 +7261,17 @@ undo_optional_reloads (void)
>  	      continue;
>  	    src = SET_SRC (set);
>  	    dest = SET_DEST (set);
> -	    if (! REG_P (src) || ! REG_P (dest))
> +	    if ((! REG_P (src) && ! SUBREG_P (src))
> +		|| (! REG_P (dest) && ! SUBREG_P (dest)))
>  	      continue;
> -	    if (REGNO (dest) == regno
> +	    if (get_regno (dest) == (int) regno
>  		/* Ignore insn for optional reloads itself.  */
> -		&& REGNO (lra_reg_info[regno].restore_rtx) != REGNO (src)
> +		&& (get_regno (lra_reg_info[regno].restore_rtx)
> +		    != get_regno (src))
>  		/* Check only inheritance on last inheritance pass.  */
> -		&& (int) REGNO (src) >= new_regno_start
> +		&& get_regno (src) >= new_regno_start
>  		/* Check that the optional reload was inherited.  */
> -		&& bitmap_bit_p (&lra_inheritance_pseudos, REGNO (src)))
> +		&& bitmap_bit_p (&lra_inheritance_pseudos, get_regno (src)))
>  	      {
>  		keep_p = true;
>  		break;
> @@ -7291,18 +7293,22 @@ undo_optional_reloads (void)
>        bitmap_copy (insn_bitmap, &lra_reg_info[regno].insn_bitmap);
>        EXECUTE_IF_SET_IN_BITMAP (insn_bitmap, 0, uid, bi2)
>  	{
> +	  /* We may have already removed a clobber.  */
> +	  if (!lra_insn_recog_data[uid])
> +	    continue;
>  	  insn = lra_insn_recog_data[uid]->insn;
>  	  if ((set = single_set (insn)) != NULL_RTX)
>  	    {
>  	      src = SET_SRC (set);
>  	      dest = SET_DEST (set);
> -	      if (REG_P (src) && REG_P (dest)
> -		  && ((REGNO (src) == regno
> -		       && (REGNO (lra_reg_info[regno].restore_rtx)
> -			   == REGNO (dest)))
> -		      || (REGNO (dest) == regno
> -			  && (REGNO (lra_reg_info[regno].restore_rtx)
> -			      == REGNO (src)))))
> +	      if ((REG_P (src) || SUBREG_P (src))
> +		  && (REG_P (dest) || SUBREG_P (dest))
> +		  && ((get_regno (src) == (int) regno
> +		       && (get_regno (lra_reg_info[regno].restore_rtx)
> +			   == get_regno (dest)))
> +		      || (get_regno (dest) == (int) regno
> +			  && (get_regno (lra_reg_info[regno].restore_rtx)
> +			      == get_regno (src)))))
>  		{
>  		  if (lra_dump_file != NULL)
>  		    {
> @@ -7310,7 +7316,7 @@ undo_optional_reloads (void)
>  			       INSN_UID (insn));
>  		      dump_insn_slim (lra_dump_file, insn);
>  		    }
> -		  delete_move_and_clobber (insn, REGNO (dest));
> +		  delete_move_and_clobber (insn, get_regno (dest));
>  		  continue;
>  		}
>  	      /* We should not worry about generation memory-memory
> @@ -7319,6 +7325,11 @@ undo_optional_reloads (void)
>  		 we remove the inheritance pseudo and the optional
>  		 reload.  */
>  	    }
> +	  if (GET_CODE (PATTERN (insn)) == CLOBBER
> +	      && REG_P (SET_DEST (insn))
> +	      && get_regno (SET_DEST (insn)) == (int) regno)
> +	    /* Refuse to remap clobbers to preexisting pseudos.  */
> +	    gcc_unreachable ();
>  	  lra_substitute_pseudo_within_insn
>  	    (insn, regno, lra_reg_info[regno].restore_rtx, false);
>  	  lra_update_insn_regno_info (insn);

Vladimir Makarov March 2, 2022, 2:21 p.m. UTC | #12

On 2022-03-02 07:25, Alexandre Oliva wrote:
> Regstrapped on x86_64-linux-gnu, also tested on various riscv and arm
> targets (with gcc-11).  Ok to install?
>
Yes.

Thank you on working this, Alex.

>> for  gcc/ChangeLog
>> 	* lra-constraints.cc (undo_optional_reloads): Recognize and
>> 	drop insns of multi-word move sequences, tolerate removal
>> 	iteration on an already-removed clobber, and refuse to
>> 	substitute original pseudos into clobbers.

diff mbox series

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index b281525750978..0365625e7b835 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -3929,7 +3929,7 @@  emit_move_multi_word (machine_mode mode, rtx x, rtx y)
      hard regs shouldn't appear here except as return values.
      We never want to emit such a clobber after reload.  */
   if (x != y
-      && ! (reload_in_progress || reload_completed)
+      && ! (lra_in_progress || reload_in_progress || reload_completed)
       && need_clobber != 0)
     emit_clobber (x);
 
diff --git a/gcc/testsuite/gcc.target/riscv/pr103302.c b/gcc/testsuite/gcc.target/riscv/pr103302.c
new file mode 100644
index 0000000000000..822c408741645
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr103302.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run } */
+/* { dg-options "-Og -fharden-compares -fno-tree-dce -fno-tree-fre " } */
+
+typedef unsigned char u8;
+typedef unsigned char __attribute__((__vector_size__ (32))) v256u8;
+typedef unsigned short __attribute__((__vector_size__ (32))) v256u16;
+typedef unsigned short __attribute__((__vector_size__ (64))) v512u16;
+typedef unsigned int u32;
+typedef unsigned int __attribute__((__vector_size__ (4))) v512u32;
+typedef unsigned long long __attribute__((__vector_size__ (32))) v256u64;
+typedef unsigned long long __attribute__((__vector_size__ (64))) v512u64;
+typedef unsigned __int128 __attribute__((__vector_size__ (32))) v256u128;
+typedef unsigned __int128 __attribute__((__vector_size__ (64))) v512u128;
+
+v512u16 g;
+
+void
+foo0 (u8 u8_0, v256u16 v256u16_0, v512u16 v512u16_0, u32 u32_0, v512u32,
+      v256u64 v256u64_0, v512u64 v512u64_0, v256u128 v256u128_0,
+      v512u128 v512u128_0)
+{
+  u32_0 <= (v512u128) (v512u128_0 != u8_0);
+  v512u64 v512u64_1 =
+    __builtin_shufflevector (v256u64_0, v512u64_0, 7, 8, 0, 9, 5, 0, 3, 1);
+  g = v512u16_0;
+  (v256u8) v256u16_0 + (v256u8) v256u128_0;
+}
+
+int
+main (void)
+{
+  foo0 (40, (v256u16)
+	{
+	}, (v512u16)
+	{
+	}, 0, (v512u32)
+	{
+	}, (v256u64)
+	{
+	}, (v512u64)
+	{
+	}, (v256u128)
+	{
+	}, (v512u128)
+	{
+	});
+}