Fix incorrect handle in vectorizable_induction for mixed induction type.

Message ID 20220920022133.64778-1-hongtao.liu@intel.com
State New
Headers
Series Fix incorrect handle in vectorizable_induction for mixed induction type. |

Commit Message

Liu, Hongtao Sept. 20, 2022, 2:21 a.m. UTC
  The codes in vectorizable_induction for slp_node assume all phi_info
have same induction type(vect_step_op_add), but since we support
nonlinear induction, it could be wrong handled.
So the patch return false when slp_node has mixed induction type.

Note codes in other place will still vectorize the induction with
separate iv update and vec_perm. But slp_node handle in
vectorizable_induction will be more optimal when all induction type
are the same, it will update ivs with one operation instead of
separate iv updates and permutation.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/103144
	* tree-vect-loop.cc (vectorizable_induction): Return false for
	slp_node with mixed induction type.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr103144-mix-1.c: New test.
	* gcc.target/i386/pr103144-mix-2.c: New test.
---
 .../gcc.target/i386/pr103144-mix-1.c          | 17 +++++++++
 .../gcc.target/i386/pr103144-mix-2.c          | 35 +++++++++++++++++++
 gcc/tree-vect-loop.cc                         | 34 ++++++++++++++----
 3 files changed, 79 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
  

Comments

Hongtao Liu Sept. 20, 2022, 2:28 a.m. UTC | #1
On Tue, Sep 20, 2022 at 10:23 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> The codes in vectorizable_induction for slp_node assume all phi_info
> have same induction type(vect_step_op_add), but since we support
> nonlinear induction, it could be wrong handled.
> So the patch return false when slp_node has mixed induction type.
>
> Note codes in other place will still vectorize the induction with
> separate iv update and vec_perm. But slp_node handle in
> vectorizable_induction will be more optimal when all induction type
> are the same, it will update ivs with one operation instead of
> separate iv updates and permutation.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/103144
>         * tree-vect-loop.cc (vectorizable_induction): Return false for
>         slp_node with mixed induction type.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr103144-mix-1.c: New test.
>         * gcc.target/i386/pr103144-mix-2.c: New test.
> ---
>  .../gcc.target/i386/pr103144-mix-1.c          | 17 +++++++++
>  .../gcc.target/i386/pr103144-mix-2.c          | 35 +++++++++++++++++++
>  gcc/tree-vect-loop.cc                         | 34 ++++++++++++++----
>  3 files changed, 79 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
> new file mode 100644
> index 00000000000..b292d66ef71
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "optimized" } } */
> +/* For induction variable with differernt induction type(vect_step_op_add, vect_step_op_neg),
> +   It should't be handled in vectorizable_induction with just 1 single iv update(addition.),
> +   separate iv update and vec_perm are needed.  */
> +int
> +__attribute__((noipa))
> +foo (int* p, int c, int n)
> +{
> +  for (int i = 0; i != n; i++)
> +    {
> +      p[2* i]= i;
> +      p[2 * i+1] = c;
> +      c = -c;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
> new file mode 100644
> index 00000000000..b7043d59aec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-mix-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +
> +#define N 34
> +void
> +avx2_test (void)
> +{
> +  int* epi32_exp = (int*) malloc (N * sizeof (int));
> +  int* epi32_dst = (int*) malloc (N * sizeof (int));
> +
> +  __builtin_memset (epi32_exp, 0, N * sizeof (int));
> +  int b = 8;
> +  v8si init1 = __extension__(v8si) { 0, b, 1, -b, 2, b, 3, -b };
> +  v8si init2 = __extension__(v8si) { 4, b, 5, -b, 6, b, 7, -b };
> +  v8si init3 = __extension__(v8si) { 8, b, 9, -b, 10, b, 11, -b };
> +  v8si init4 = __extension__(v8si) { 12, b, 13, -b, 14, b, 15, -b };
> +  memcpy (epi32_exp, &init1, 32);
> +  memcpy (epi32_exp + 8, &init2, 32);
> +  memcpy (epi32_exp + 16, &init3, 32);
> +  memcpy (epi32_exp + 24, &init4, 32);
> +  epi32_exp[32] = 16;
> +  epi32_exp[33] = b;
> +  foo (epi32_dst, b, N / 2);
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 9c434b66c5b..c7050a47c1c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -9007,14 +9007,34 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>      iv_loop = loop;
>    gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
>
> -  if (slp_node && !nunits.is_constant ())
> +  if (slp_node)
>      {
> -      /* The current SLP code creates the step value element-by-element.  */
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "SLP induction not supported for variable-length"
> -                        " vectors.\n");
> -      return false;
> +      if (!nunits.is_constant ())
> +       {
> +         /* The current SLP code creates the step value element-by-element.  */
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "SLP induction not supported for variable-length"
> +                            " vectors.\n");
> +         return false;
> +       }
> +
> +      stmt_vec_info phi_info;
> +      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
> +       {
> +         if (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info) != vect_step_op_add)
> +           {
> +             /* The below SLP code assume all induction type to be the same.
> +                But slp in other place will still vectorize the loop via updating
> +                iv update separately + vec_perm, but not from below codes.  */
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "SLP induction not supported for mixed induction type"
> +                                " vectors.\n");
> +             return false;
> +           }
> +       }
> +
>      }
>
>    if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
> --
> 2.18.1
>
  
Richard Biener Sept. 21, 2022, 7:31 a.m. UTC | #2
On Tue, Sep 20, 2022 at 4:24 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The codes in vectorizable_induction for slp_node assume all phi_info
> have same induction type(vect_step_op_add), but since we support
> nonlinear induction, it could be wrong handled.
> So the patch return false when slp_node has mixed induction type.
>
> Note codes in other place will still vectorize the induction with
> separate iv update and vec_perm. But slp_node handle in
> vectorizable_induction will be more optimal when all induction type
> are the same, it will update ivs with one operation instead of
> separate iv updates and permutation.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/103144
>         * tree-vect-loop.cc (vectorizable_induction): Return false for
>         slp_node with mixed induction type.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr103144-mix-1.c: New test.
>         * gcc.target/i386/pr103144-mix-2.c: New test.
> ---
>  .../gcc.target/i386/pr103144-mix-1.c          | 17 +++++++++
>  .../gcc.target/i386/pr103144-mix-2.c          | 35 +++++++++++++++++++
>  gcc/tree-vect-loop.cc                         | 34 ++++++++++++++----
>  3 files changed, 79 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
> new file mode 100644
> index 00000000000..b292d66ef71
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "optimized" } } */
> +/* For induction variable with differernt induction type(vect_step_op_add, vect_step_op_neg),
> +   It should't be handled in vectorizable_induction with just 1 single iv update(addition.),
> +   separate iv update and vec_perm are needed.  */
> +int
> +__attribute__((noipa))
> +foo (int* p, int c, int n)
> +{
> +  for (int i = 0; i != n; i++)
> +    {
> +      p[2* i]= i;
> +      p[2 * i+1] = c;
> +      c = -c;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
> new file mode 100644
> index 00000000000..b7043d59aec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr103144-mix-1.c"
> +
> +typedef int v8si __attribute__((vector_size(32)));
> +
> +#define N 34
> +void
> +avx2_test (void)
> +{
> +  int* epi32_exp = (int*) malloc (N * sizeof (int));
> +  int* epi32_dst = (int*) malloc (N * sizeof (int));
> +
> +  __builtin_memset (epi32_exp, 0, N * sizeof (int));
> +  int b = 8;
> +  v8si init1 = __extension__(v8si) { 0, b, 1, -b, 2, b, 3, -b };
> +  v8si init2 = __extension__(v8si) { 4, b, 5, -b, 6, b, 7, -b };
> +  v8si init3 = __extension__(v8si) { 8, b, 9, -b, 10, b, 11, -b };
> +  v8si init4 = __extension__(v8si) { 12, b, 13, -b, 14, b, 15, -b };
> +  memcpy (epi32_exp, &init1, 32);
> +  memcpy (epi32_exp + 8, &init2, 32);
> +  memcpy (epi32_exp + 16, &init3, 32);
> +  memcpy (epi32_exp + 24, &init4, 32);
> +  epi32_exp[32] = 16;
> +  epi32_exp[33] = b;
> +  foo (epi32_dst, b, N / 2);
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 9c434b66c5b..c7050a47c1c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -9007,14 +9007,34 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>      iv_loop = loop;
>    gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
>
> -  if (slp_node && !nunits.is_constant ())
> +  if (slp_node)
>      {
> -      /* The current SLP code creates the step value element-by-element.  */
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "SLP induction not supported for variable-length"
> -                        " vectors.\n");
> -      return false;
> +      if (!nunits.is_constant ())
> +       {
> +         /* The current SLP code creates the step value element-by-element.  */
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "SLP induction not supported for variable-length"
> +                            " vectors.\n");
> +         return false;
> +       }
> +
> +      stmt_vec_info phi_info;
> +      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
> +       {
> +         if (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info) != vect_step_op_add)

While it works to check this here the canonical place to match up def
types is in
vect_get_and_check_slp_defs.  There we do

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)

the induction def type match now needs to also compare
STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE
it seems.

Note we also fail to do this for STMT_VINFO_REDUC_TYPE (but that's eventually
only computed too late).

Being able to reject this during SLP discovery means that if you'd have

   x = i + j;
   y = j + i;

with two different induction types for 'i' and 'j', SLP discovery can
try commutative
swapping to match the inductions up for x and y and successfully SLP
the operation.

That said, I can see the checking condition becoming more ugly (please don't
try to amend vect_def_types_match itself), adding something like

    || (oprnd_info->first_dt == vect_induction_def
        && STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (...) != ... (stmts[0]))

Can you try doing that?

Thanks,
Richard.

> +           {
> +             /* The below SLP code assume all induction type to be the same.
> +                But slp in other place will still vectorize the loop via updating
> +                iv update separately + vec_perm, but not from below codes.  */
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "SLP induction not supported for mixed induction type"
> +                                " vectors.\n");
> +             return false;
> +           }
> +       }
> +
>      }
>
>    if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
> --
> 2.18.1
>
  

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
new file mode 100644
index 00000000000..b292d66ef71
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-1.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "optimized" } } */
+/* For induction variable with differernt induction type(vect_step_op_add, vect_step_op_neg),
+   It should't be handled in vectorizable_induction with just 1 single iv update(addition.),
+   separate iv update and vec_perm are needed.  */
+int
+__attribute__((noipa))
+foo (int* p, int c, int n)
+{
+  for (int i = 0; i != n; i++)
+    {
+      p[2* i]= i;
+      p[2 * i+1] = c;
+      c = -c;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
new file mode 100644
index 00000000000..b7043d59aec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mix-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr103144-mix-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+
+#define N 34
+void
+avx2_test (void)
+{
+  int* epi32_exp = (int*) malloc (N * sizeof (int));
+  int* epi32_dst = (int*) malloc (N * sizeof (int));
+
+  __builtin_memset (epi32_exp, 0, N * sizeof (int));
+  int b = 8;
+  v8si init1 = __extension__(v8si) { 0, b, 1, -b, 2, b, 3, -b };
+  v8si init2 = __extension__(v8si) { 4, b, 5, -b, 6, b, 7, -b };
+  v8si init3 = __extension__(v8si) { 8, b, 9, -b, 10, b, 11, -b };
+  v8si init4 = __extension__(v8si) { 12, b, 13, -b, 14, b, 15, -b };
+  memcpy (epi32_exp, &init1, 32);
+  memcpy (epi32_exp + 8, &init2, 32);
+  memcpy (epi32_exp + 16, &init3, 32);
+  memcpy (epi32_exp + 24, &init4, 32);
+  epi32_exp[32] = 16;
+  epi32_exp[33] = b;
+  foo (epi32_dst, b, N / 2);
+  if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9c434b66c5b..c7050a47c1c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9007,14 +9007,34 @@  vectorizable_induction (loop_vec_info loop_vinfo,
     iv_loop = loop;
   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
 
-  if (slp_node && !nunits.is_constant ())
+  if (slp_node)
     {
-      /* The current SLP code creates the step value element-by-element.  */
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "SLP induction not supported for variable-length"
-			 " vectors.\n");
-      return false;
+      if (!nunits.is_constant ())
+	{
+	  /* The current SLP code creates the step value element-by-element.  */
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "SLP induction not supported for variable-length"
+			     " vectors.\n");
+	  return false;
+	}
+
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
+	{
+	  if (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info) != vect_step_op_add)
+	    {
+	      /* The below SLP code assume all induction type to be the same.
+		 But slp in other place will still vectorize the loop via updating
+		 iv update separately + vec_perm, but not from below codes.  */
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "SLP induction not supported for mixed induction type"
+				 " vectors.\n");
+	      return false;
+	    }
+	}
+
     }
 
   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)