[committed] aarch64: Detect more consecutive MEMs

Message ID mptr1blp98y.fsf@arm.com
State Committed
Series [committed] aarch64: Detect more consecutive MEMs

Commit Message

Richard Sandiford Nov. 12, 2021, 5:35 p.m. UTC
  For tests like:

    int res[2];
    void
    f1 (int x, int y)
    {
      res[0] = res[1] = x + y;
    }

we generated:

        add     w0, w0, w1
        adrp    x1, .LANCHOR0
        add     x2, x1, :lo12:.LANCHOR0
        str     w0, [x1, #:lo12:.LANCHOR0]
        str     w0, [x2, 4]
        ret

Using [x1, #:lo12:.LANCHOR0] for the first store prevented the
two stores from being recognised as a pair.  However, the MEM_EXPR
and MEM_OFFSET information tell us that the MEMs really are
consecutive.  The peephole2 context then guarantees that the
first address is equivalent to [x2, 0].
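
After the patch the two stores are recognised as a pair, so the
expected output is instead something like (illustrative; the exact
register allocation may differ):

        add     w0, w0, w1
        adrp    x1, .LANCHOR0
        add     x1, x1, :lo12:.LANCHOR0
        stp     w0, w0, [x1]    // single store pair replaces the two strs
        ret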

While there: the reg_mentioned_p tests for loads were probably correct,
but seemed a bit indirect.  We're matching two consecutive loads,
so the thing we need to test is that the second MEM in the original
sequence doesn't depend on the result of the first load in the
original sequence.
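
For instance (a made-up illustration, not taken from the patch), a
sequence like:

        ldr     x0, [x1]        // first load defines x0
        ldr     x2, [x0, 8]     // second address uses the result x0

cannot be merged into an ldp; the reg_overlap_mentioned_p (reg_1, mem_2)
check added by this patch rejects such cases directly.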

Tested on aarch64-linux-gnu & applied.

Richard


gcc/
	* config/aarch64/aarch64.c: Include tree-dfa.h.
	(aarch64_check_consecutive_mems): New function that takes MEM_EXPR
	and MEM_OFFSET into account.
	(aarch64_swap_ldrstr_operands): Use it.
	(aarch64_operands_ok_for_ldpstp): Likewise.  Check that the
	address of the second memory doesn't depend on the result of
	the first load.

gcc/testsuite/
	* gcc.target/aarch64/stp_1.c: New test.
---
 gcc/config/aarch64/aarch64.c             | 156 +++++++++++++++--------
 gcc/testsuite/gcc.target/aarch64/stp_1.c |  29 +++++
 2 files changed, 133 insertions(+), 52 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_1.c

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a8f53b85d92..850288d0e01 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -79,6 +79,7 @@ 
 #include "tree-ssa-loop-niter.h"
 #include "fractional-cost.h"
 #include "rtlanal.h"
+#include "tree-dfa.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -24569,6 +24570,97 @@  aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
   return priority;
 }
 
+/* Check if *MEM1 and *MEM2 are consecutive memory references and,
+   if they are, try to make them use constant offsets from the same base
+   register.  Return true on success.  When returning true, set *REVERSED
+   to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2.  */
+static bool
+aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
+{
+  *reversed = false;
+  if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
+      || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
+    return false;
+
+  if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
+    return false;
+
+  auto size1 = MEM_SIZE (*mem1);
+  auto size2 = MEM_SIZE (*mem2);
+
+  rtx base1, base2, offset1, offset2;
+  extract_base_offset_in_addr (*mem1, &base1, &offset1);
+  extract_base_offset_in_addr (*mem2, &base2, &offset2);
+
+  /* Make sure at least one memory is in base+offset form.  */
+  if (!(base1 && offset1) && !(base2 && offset2))
+    return false;
+
+  /* If both mems already use the same base register, just check the
+     offsets.  */
+  if (base1 && base2 && rtx_equal_p (base1, base2))
+    {
+      if (!offset1 || !offset2)
+	return false;
+
+      if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
+	return true;
+
+      if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)))
+	{
+	  *reversed = true;
+	  return true;
+	}
+
+      return false;
+    }
+
+  /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
+     guarantee that the values are consecutive.  */
+  if (MEM_EXPR (*mem1)
+      && MEM_EXPR (*mem2)
+      && MEM_OFFSET_KNOWN_P (*mem1)
+      && MEM_OFFSET_KNOWN_P (*mem2))
+    {
+      poly_int64 expr_offset1;
+      poly_int64 expr_offset2;
+      tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
+						       &expr_offset1);
+      tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
+						       &expr_offset2);
+      if (!expr_base1
+	  || !expr_base2
+	  || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
+	return false;
+
+      expr_offset1 += MEM_OFFSET (*mem1);
+      expr_offset2 += MEM_OFFSET (*mem2);
+
+      if (known_eq (expr_offset1 + size1, expr_offset2))
+	;
+      else if (known_eq (expr_offset2 + size2, expr_offset1))
+	*reversed = true;
+      else
+	return false;
+
+      if (base2)
+	{
+	  rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
+				     expr_offset1 - expr_offset2);
+	  *mem1 = replace_equiv_address_nv (*mem1, addr1);
+	}
+      else
+	{
+	  rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
+				     expr_offset2 - expr_offset1);
+	  *mem2 = replace_equiv_address_nv (*mem2, addr2);
+	}
+      return true;
+    }
+
+  return false;
+}
+
 /* Given OPERANDS of consecutive load/store, check if we can merge
    them into ldp/stp.  LOAD is true if they are load instructions.
    MODE is the mode of memory operands.  */
@@ -24577,9 +24669,8 @@  bool
 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
 				machine_mode mode)
 {
-  HOST_WIDE_INT offval_1, offval_2, msize;
   enum reg_class rclass_1, rclass_2;
-  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
+  rtx mem_1, mem_2, reg_1, reg_2;
 
   if (load)
     {
@@ -24590,6 +24681,8 @@  aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
       if (REGNO (reg_1) == REGNO (reg_2))
 	return false;
+      if (reg_overlap_mentioned_p (reg_1, mem_2))
+	return false;
     }
   else
     {
@@ -24613,40 +24706,13 @@  aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
     return false;
 
   /* Check if the addresses are in the form of [base+offset].  */
-  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
-  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
-    return false;
-  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
-  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
-    return false;
-
-  /* Check if the bases are same.  */
-  if (!rtx_equal_p (base_1, base_2))
+  bool reversed = false;
+  if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
     return false;
 
   /* The operands must be of the same size.  */
   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
-			 GET_MODE_SIZE (GET_MODE (mem_2))));
-
-  offval_1 = INTVAL (offset_1);
-  offval_2 = INTVAL (offset_2);
-  /* We should only be trying this for fixed-sized modes.  There is no
-     SVE LDP/STP instruction.  */
-  msize = GET_MODE_SIZE (mode).to_constant ();
-  /* Check if the offsets are consecutive.  */
-  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
-    return false;
-
-  /* Check if the addresses are clobbered by load.  */
-  if (load)
-    {
-      if (reg_mentioned_p (reg_1, mem_1))
-	return false;
-
-      /* In increasing order, the last load can clobber the address.  */
-      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
-	return false;
-    }
+			GET_MODE_SIZE (GET_MODE (mem_2))));
 
   /* One of the memory accesses must be a mempair operand.
      If it is not the first one, they need to be swapped by the
@@ -24677,27 +24743,13 @@  aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
 void
 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
 {
-  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
-  HOST_WIDE_INT offval_1, offval_2;
-
-  if (load)
-    {
-      mem_1 = operands[1];
-      mem_2 = operands[3];
-    }
-  else
-    {
-      mem_1 = operands[0];
-      mem_2 = operands[2];
-    }
-
-  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
-  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
-
-  offval_1 = INTVAL (offset_1);
-  offval_2 = INTVAL (offset_2);
+  int mem_op = load ? 1 : 0;
+  bool reversed = false;
+  if (!aarch64_check_consecutive_mems (operands + mem_op,
+				       operands + mem_op + 2, &reversed))
+    gcc_unreachable ();
 
-  if (offval_1 > offval_2)
+  if (reversed)
     {
       /* Irrespective of whether this is a load or a store,
 	 we do the same swap.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_1.c b/gcc/testsuite/gcc.target/aarch64/stp_1.c
new file mode 100644
index 00000000000..c1a5e6b776d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_1.c
@@ -0,0 +1,29 @@ 
+/* { dg-options "-O -fpeephole2" } */
+
+int res[2];
+
+void
+f1 (int x, int y)
+{
+  res[0] = res[1] = x + y;
+}
+
+void
+f2 (int x, int y, int *res)
+{
+  res[0] = res[1] = x + y;
+}
+
+void
+f3 (int x, int y)
+{
+  res[1] = res[0] = x + y;
+}
+
+void
+f4 (int x, int y, int *res)
+{
+  res[1] = res[0] = x + y;
+}
+
+/* { dg-final { scan-assembler-times {\tstp\t} 4 } } */