[aarch64] Code-gen for vector initialization involving constants

Message ID CAAgBjMnwGk4fOc3PTM_agTXXFvt=767a3-AWOfSr23Xja6K81w@mail.gmail.com
State New
Headers
Series [aarch64] Code-gen for vector initialization involving constants |

Commit Message

Prathamesh Kulkarni Feb. 3, 2023, 7:16 a.m. UTC
  Hi Richard,
While digging thru aarch64_expand_vector_init, I noticed it gives
priority to loading a constant first:
 /* Initialise a vector which is part-variable.  We want to first try
     to build those lanes which are constant in the most efficient way we
     can.  */

which results in suboptimal code-gen for following case:
int16x8_t f_s16(int16_t x)
{
  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}

code-gen trunk:
f_s16:
        movi    v0.8h, 0x1
        ins     v0.h[0], w0
        ins     v0.h[1], w0
        ins     v0.h[2], w0
        ins     v0.h[3], w0
        ins     v0.h[4], w0
        ins     v0.h[5], w0
        ins     v0.h[6], w0
        ret

The attached patch tweaks the following condition:
if (n_var == n_elts && n_elts <= 16)
  {
    ...
  }

to pass if maxv >= 80% of n_elts, with 80% being an
arbitrary "high enough" threshold. The intent is to dup
the most repeating variable if it it's repetition
is "high enough" and insert constants which should be "better" than
loading constant first and inserting variables like in the above case.

Alternatively, I suppose we can remove threshold and for constants,
generate both sequences and check which one is more
efficient ?

code-gen with patch:
f_s16:
        dup     v0.8h, w0
        movi    v1.4h, 0x1
        ins     v0.h[7], v1.h[0]
        ret

The patch is lightly tested to verify that vec[t]-init-*.c tests pass
with bootstrap+test
in progress.
Does this look OK ?

Thanks,
Prathamesh
  

Comments

Prathamesh Kulkarni Feb. 13, 2023, 6:28 a.m. UTC | #1
On Fri, 3 Feb 2023 at 12:46, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> Hi Richard,
> While digging thru aarch64_expand_vector_init, I noticed it gives
> priority to loading a constant first:
>  /* Initialise a vector which is part-variable.  We want to first try
>      to build those lanes which are constant in the most efficient way we
>      can.  */
>
> which results in suboptimal code-gen for following case:
> int16x8_t f_s16(int16_t x)
> {
>   return (int16x8_t) { x, x, x, x, x, x, x, 1 };
> }
>
> code-gen trunk:
> f_s16:
>         movi    v0.8h, 0x1
>         ins     v0.h[0], w0
>         ins     v0.h[1], w0
>         ins     v0.h[2], w0
>         ins     v0.h[3], w0
>         ins     v0.h[4], w0
>         ins     v0.h[5], w0
>         ins     v0.h[6], w0
>         ret
>
> The attached patch tweaks the following condition:
> if (n_var == n_elts && n_elts <= 16)
>   {
>     ...
>   }
>
> to pass if maxv >= 80% of n_elts, with 80% being an
> arbitrary "high enough" threshold. The intent is to dup
> the most repeating variable if it it's repetition
> is "high enough" and insert constants which should be "better" than
> loading constant first and inserting variables like in the above case.
>
> Alternatively, I suppose we can remove threshold and for constants,
> generate both sequences and check which one is more
> efficient ?
>
> code-gen with patch:
> f_s16:
>         dup     v0.8h, w0
>         movi    v1.4h, 0x1
>         ins     v0.h[7], v1.h[0]
>         ret
>
> The patch is lightly tested to verify that vec[t]-init-*.c tests pass
> with bootstrap+test
> in progress.
> Does this look OK ?
Hi Richard,
ping https://gcc.gnu.org/pipermail/gcc-patches/2023-February/611243.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
  

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acc0cfe5f94..df33509c6e4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22079,30 +22079,36 @@  aarch64_expand_vector_init (rtx target, rtx vals)
      and matches[X][1] with the count of duplicate elements (if X is the
      earliest element which has duplicates).  */
 
-  if (n_var == n_elts && n_elts <= 16)
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
     {
-      int matches[16][2] = {0};
-      for (int i = 0; i < n_elts; i++)
+      for (int j = 0; j <= i; j++)
 	{
-	  for (int j = 0; j <= i; j++)
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
 	    {
-	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
-		{
-		  matches[i][0] = j;
-		  matches[j][1]++;
-		  break;
-		}
+	      matches[i][0] = j;
+	      matches[j][1]++;
+	      break;
 	    }
 	}
-      int maxelement = 0;
-      int maxv = 0;
-      for (int i = 0; i < n_elts; i++)
-	if (matches[i][1] > maxv)
-	  {
-	    maxelement = i;
-	    maxv = matches[i][1];
-	  }
+    }
 
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+    if (matches[i][1] > maxv)
+      {
+	maxelement = i;
+	maxv = matches[i][1];
+      }
+
+  rtx max_elem = XVECEXP (vals, 0, maxelement); 
+  if (n_elts <= 16
+      && ((n_var == n_elts)
+	   || (maxv >= (int)(0.8 * n_elts)
+	       && !CONST_INT_P (max_elem)
+	       && !CONST_DOUBLE_P (max_elem))))
+    {
       /* Create a duplicate of the most common element, unless all elements
 	 are equally useless to us, in which case just immediately set the
 	 vector register using the first element.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
new file mode 100644
index 00000000000..e20b813559e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f1_s16:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
+**	ins	v[0-9]+\.h\[7\], v[0-9]+\.h\[0\]
+**	...
+**	ret
+*/
+
+int16x8_t f1_s16(int16_t x)
+{
+  return (int16x8_t) {x, x, x, x, x, x, x, 1};
+}
+
+/*
+** f2_s16:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
+**	movi	v[0-9]+\.4h, 0x2
+**	ins	v[0-9]+\.h\[6\], v[0-9]+\.h\[0\]
+**	ins	v[0-9]+\.h\[7\], v[0-9]+\.h\[0\]
+**	...
+**	ret
+*/
+
+int16x8_t f2_s16(int16_t x)
+{
+  return (int16x8_t) { x, x, x, x, x, x, 1, 2 };
+}
+
+/*
+** f3_s16:
+**	...
+**	movi	v[0-9]+\.8h, 0x1
+**	ins	v[0-9]+\.h\[0\], w0
+**	ins	v[0-9]+\.h\[1\], w0
+**	ins	v[0-9]+\.h\[2\], w0
+**	...
+**	ret
+*/
+
+int16x8_t f3_s16(int16_t x)
+{
+  return (int16x8_t) {x, x, x, 1, 1, 1, 1, 1};
+}