[1/2] PR96463 - aarch64 specific changes

Message ID: CAAgBjM=meq6r-Yi3BJ6WWQ_eQHqV-u7nyN8NFmnxGefmeWp7hg@mail.gmail.com
State: New
Series: [1/2] PR96463 - aarch64 specific changes
Commit Message

Prathamesh Kulkarni Dec. 17, 2021, 10:04 a.m. UTC
  Hi,
The patch folds:
lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
into:
lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
and expands the above vec_perm_expr using aarch64_expand_sve_dupq.

With the patch, for the following test:
#include <arm_sve.h>
#include <arm_neon.h>

svint32_t
foo (int32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

it generates the following code:
foo:
.LFB4350:
        dup     z0.q, z0.q[0]
        ret

The patch passes bootstrap+test on aarch64-linux-gnu,
but I am not sure whether the changes to aarch64_evpc_sve_tbl
are correct.

Thanks,
Prathamesh
  

Comments

Richard Sandiford Dec. 17, 2021, 11:33 a.m. UTC | #1
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> Hi,
> The patch folds:
> lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> into:
> lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> and expands above vec_perm_expr using aarch64_expand_sve_dupq.
>
> With patch, for following test:
> #include <arm_sve.h>
> #include <arm_neon.h>
>
> svint32_t
> foo (int32x4_t x)
> {
>   return svld1rq (svptrue_b8 (), &x[0]);
> }
>
> it generates following code:
> foo:
> .LFB4350:
>         dup     z0.q, z0.q[0]
>         ret
>
> and passes bootstrap+test on aarch64-linux-gnu.
> But I am not sure if the changes to aarch64_evpc_sve_tbl
> are correct.

Just in case: I was only using int32x4_t in the PR as an example.
The same thing should work for all element types.
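
For example, sketches along these lines (the function names are just
for illustration) should all fold the same way and end up as a single
"dup z0.q, z0.q[0]":

#include <arm_neon.h>
#include <arm_sve.h>

svint16_t
foo_s16 (int16x8_t x)
{
  return svld1rq_s16 (svptrue_b8 (), &x[0]);
}

svfloat64_t
foo_f64 (float64x2_t x)
{
  return svld1rq_f64 (svptrue_b8 (), &x[0]);
}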

>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 02e42a71e5e..e21bbec360c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -1207,6 +1207,56 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> +       into:
> +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> +       on little endian target.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0)
> +	&& TREE_CODE (arg1) == ADDR_EXPR)
> +      {
> +	tree t = TREE_OPERAND (arg1, 0);
> +	if (TREE_CODE (t) == ARRAY_REF)
> +	  {
> +	    tree index = TREE_OPERAND (t, 1);
> +	    t = TREE_OPERAND (t, 0);
> +	    if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> +	      {
> +		t = TREE_OPERAND (t, 0);
> +		tree vectype = TREE_TYPE (t);
> +		if (VECTOR_TYPE_P (vectype)
> +		    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> +		    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> +		  {

Since this is quite a specific pattern match, and since we now lower
arm_neon.h vld1* to normal gimple accesses, I think we should try the
“more generally” approach mentioned in the PR and see what the fallout
is.  That is, keep:

    if (!BYTES_BIG_ENDIAN
	&& integer_all_onesp (arg0)

If those conditions pass, create an Advanced SIMD access at address arg1,
using similar code to the handling of:

     BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
     BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
     BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)

in aarch64_general_gimple_fold_builtin.  (Would be good to move the
common code to aarch64.c so that both files can use it.)
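
In other words, the fold would emit gimple along the lines of (a sketch
only; the SSA names and the int32x4_t case are just for illustration):

  tmp = MEM_REF <int32x4_t> [(int * {ref-all}) arg1];
  lhs = VEC_PERM_EXPR <tmp, tmp, { 0, 1, 2, 3, ... }>;

so that the match no longer depends on arg1 literally being &v[0].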

> +		    tree lhs = gimple_call_lhs (f.call);
> +		    tree lhs_type = TREE_TYPE (lhs);
> +		    int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> +		    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +		    for (int i = 0; i < source_nelts; i++)
> +		      sel.quick_push (i);
> +
> +		    vec_perm_indices indices (sel, 1, source_nelts);
> +		    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> +		      return NULL;

I don't think we need to check this: it should always be true.
Probably worth keeping as a gcc_checking_assert though.
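
I.e. something like:

  gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));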

> +
> +		    tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +		    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> +		  }
> +	      }
> +	  }
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index f07330cff4f..af27f550be3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>  
>    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
>    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> +
>    if (d->one_vector_p)
> -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> +    {
> +      bool use_dupq = false;
> +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> +      if (GET_CODE (sel) == CONST_VECTOR
> +	  && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> +	  && CONST_VECTOR_DUPLICATE_P (sel))
> +	  {
> +	    unsigned nelts = const_vector_encoded_nelts (sel);
> +	    unsigned i;
> +	    for (i = 0; i < nelts; i++)
> +	      {
> +		rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> +		if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> +		  break;
> +	      }
> +	    if (i == nelts)
> +	      use_dupq = true;
> +	  }
> +
> +      if (use_dupq)
> +	aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +      else
> +	emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> +    }

This shouldn't be a TBL but a new operation, handled by its own
aarch64_evpc_sve_* routine.  The check for the mask should then
be done on d->perm, to detect whether the permutation is one
that the new routine supports.

I think the requirements are:

- !BYTES_BIG_ENDIAN
- the source must be an Advanced SIMD vector
- the destination must be an SVE vector
- the permutation must be a duplicate (tested in the code above)
- the number of “patterns” in the permutation must equal the number of
  source elements
- element X of the permutation must equal X (tested in the code above)
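
For example (just to illustrate the encoding): with an int32x4_t source
and an svint32_t destination, the permutation is the variable-length
series { 0, 1, 2, 3, 0, 1, 2, 3, ... }, i.e. a vec_perm_indices whose
encoding has 4 patterns (one per source element) of one element each,
with element X of the encoding equal to X.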

The existing aarch64_evpc_* routines expect the source and target modes
to be the same, so we should only call them when that's true.

Thanks,
Richard
  
Prathamesh Kulkarni Dec. 27, 2021, 10:24 a.m. UTC | #2
On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > Hi,
> > The patch folds:
> > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > into:
> > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> >
> > With patch, for following test:
> > #include <arm_sve.h>
> > #include <arm_neon.h>
> >
> > svint32_t
> > foo (int32x4_t x)
> > {
> >   return svld1rq (svptrue_b8 (), &x[0]);
> > }
> >
> > it generates following code:
> > foo:
> > .LFB4350:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > and passes bootstrap+test on aarch64-linux-gnu.
> > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > are correct.
>
> Just in case: I was only using int32x4_t in the PR as an example.
> The same thing should work for all element types.
>
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index 02e42a71e5e..e21bbec360c 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -1207,6 +1207,56 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > +       into:
> > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > +       on little endian target.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0)
> > +     && TREE_CODE (arg1) == ADDR_EXPR)
> > +      {
> > +     tree t = TREE_OPERAND (arg1, 0);
> > +     if (TREE_CODE (t) == ARRAY_REF)
> > +       {
> > +         tree index = TREE_OPERAND (t, 1);
> > +         t = TREE_OPERAND (t, 0);
> > +         if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > +           {
> > +             t = TREE_OPERAND (t, 0);
> > +             tree vectype = TREE_TYPE (t);
> > +             if (VECTOR_TYPE_P (vectype)
> > +                 && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > +                 && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > +               {
>
> Since this is quite a specific pattern match, and since we now lower
> arm_neon.h vld1* to normal gimple accesses, I think we should try the
> “more generally” approach mentioned in the PR and see what the fallout
> is.  That is, keep:
>
>     if (!BYTES_BIG_ENDIAN
>         && integer_all_onesp (arg0)
>
> If those conditions pass, create an Advanced SIMD access at address arg1,
> using similar code to the handling of:
>
>      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
>      BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
>      BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
>
> in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> common code to aarch64.c so that both files can use it.)
>
> > +                 tree lhs = gimple_call_lhs (f.call);
> > +                 tree lhs_type = TREE_TYPE (lhs);
> > +                 int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > +                 vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +                 for (int i = 0; i < source_nelts; i++)
> > +                   sel.quick_push (i);
> > +
> > +                 vec_perm_indices indices (sel, 1, source_nelts);
> > +                 if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> > +                   return NULL;
>
> I don't think we need to check this: it should always be true.
> Probably worth keeping as a gcc_checking_assert though.
>
> > +
> > +                 tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +                 return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> > +               }
> > +           }
> > +       }
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index f07330cff4f..af27f550be3 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >
> >    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> >    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > +
> >    if (d->one_vector_p)
> > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > +    {
> > +      bool use_dupq = false;
> > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> > +      if (GET_CODE (sel) == CONST_VECTOR
> > +       && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > +       && CONST_VECTOR_DUPLICATE_P (sel))
> > +       {
> > +         unsigned nelts = const_vector_encoded_nelts (sel);
> > +         unsigned i;
> > +         for (i = 0; i < nelts; i++)
> > +           {
> > +             rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > +             if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > +               break;
> > +           }
> > +         if (i == nelts)
> > +           use_dupq = true;
> > +       }
> > +
> > +      if (use_dupq)
> > +     aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +      else
> > +     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > +    }
>
> This shouldn't be a TBL but a new operation, handled by its own
> aarch64_evpc_sve_* routine.  The check for the mask should then
> be done on d->perm, to detect whether the permutation is one
> that the new routine supports.
>
> I think the requirements are:
>
> - !BYTES_BIG_ENDIAN
> - the source must be an Advanced SIMD vector
> - the destination must be an SVE vector
> - the permutation must be a duplicate (tested in the code above)
> - the number of “patterns” in the permutation must equal the number of
>   source elements
> - element X of the permutation must equal X (tested in the code above)
>
> The existing aarch64_evpc_* routines expect the source and target modes
> to be the same, so we should only call them when that's true.
Hi Richard,
Thanks for the suggestions, and sorry for the late reply.
Does the following patch look OK (sans the refactoring of building the mem_ref)?
Passes bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 0d09fe9dd6d..656d39a741c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -47,6 +47,7 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "gimple-fold.h"
+#include "aarch64-builtins.h"
 
 #define v8qi_UP  E_V8QImode
 #define v8di_UP  E_V8DImode
@@ -128,46 +129,6 @@
 
 #define SIMD_MAX_BUILTIN_ARGS 5
 
-enum aarch64_type_qualifiers
-{
-  /* T foo.  */
-  qualifier_none = 0x0,
-  /* unsigned T foo.  */
-  qualifier_unsigned = 0x1, /* 1 << 0  */
-  /* const T foo.  */
-  qualifier_const = 0x2, /* 1 << 1  */
-  /* T *foo.  */
-  qualifier_pointer = 0x4, /* 1 << 2  */
-  /* Used when expanding arguments if an operand could
-     be an immediate.  */
-  qualifier_immediate = 0x8, /* 1 << 3  */
-  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
-  /* void foo (...).  */
-  qualifier_void = 0x20, /* 1 << 5  */
-  /* Some patterns may have internal operands, this qualifier is an
-     instruction to the initialisation code to skip this operand.  */
-  qualifier_internal = 0x40, /* 1 << 6  */
-  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
-     rather than using the type of the operand.  */
-  qualifier_map_mode = 0x80, /* 1 << 7  */
-  /* qualifier_pointer | qualifier_map_mode  */
-  qualifier_pointer_map_mode = 0x84,
-  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
-  qualifier_const_pointer_map_mode = 0x86,
-  /* Polynomial types.  */
-  qualifier_poly = 0x100,
-  /* Lane indices - must be in range, and flipped for bigendian.  */
-  qualifier_lane_index = 0x200,
-  /* Lane indices for single lane structure loads and stores.  */
-  qualifier_struct_load_store_lane_index = 0x400,
-  /* Lane indices selected in pairs. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_pair_index = 0x800,
-  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_quadtup_index = 0x1000,
-};
-
 /* Flags that describe what a function might do.  */
 const unsigned int FLAG_NONE = 0U;
 const unsigned int FLAG_READ_FPCR = 1U << 0;
@@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = {
   NULL
 };
 
-#define ENTRY(E, M, Q, G) E,
-enum aarch64_simd_type
-{
-#include "aarch64-simd-builtin-types.def"
-  ARM_NEON_H_TYPES_LAST
-};
-#undef ENTRY
-
-struct GTY(()) aarch64_simd_type_info
-{
-  enum aarch64_simd_type type;
-
-  /* Internal type name.  */
-  const char *name;
-
-  /* Internal type name(mangled).  The mangled names conform to the
-     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
-     Appendix A).  To qualify for emission with the mangled names defined in
-     that document, a vector type must not only be of the correct mode but also
-     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
-     types are registered by aarch64_init_simd_builtin_types ().  In other
-     words, vector types defined in other ways e.g. via vector_size attribute
-     will get default mangled names.  */
-  const char *mangle;
-
-  /* Internal type.  */
-  tree itype;
-
-  /* Element type.  */
-  tree eltype;
-
-  /* Machine mode the internal type maps to.  */
-  enum machine_mode mode;
-
-  /* Qualifiers.  */
-  enum aarch64_type_qualifiers q;
-};
-
 #define ENTRY(E, M, Q, G)  \
   {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q},
 static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = {
@@ -2796,6 +2719,14 @@ get_mem_type_for_load_store (unsigned int fcode)
   }
 }
 
+/* Return aarch64_simd_type_info corresponding to TYPE.  */
+
+aarch64_simd_type_info
+aarch64_get_simd_info_for_type (enum aarch64_simd_type type)
+{
+  return aarch64_simd_types[type];
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h
new file mode 100644
index 00000000000..b395402379c
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtins.h
@@ -0,0 +1,85 @@
+#ifndef AARCH64_BUILTINS_H
+#define AARCH64_BUILTINS_H
+
+#define ENTRY(E, M, Q, G) E,
+enum aarch64_simd_type
+{
+#include "aarch64-simd-builtin-types.def"
+  ARM_NEON_H_TYPES_LAST
+};
+#undef ENTRY
+
+enum aarch64_type_qualifiers
+{
+  /* T foo.  */
+  qualifier_none = 0x0,
+  /* unsigned T foo.  */
+  qualifier_unsigned = 0x1, /* 1 << 0  */
+  /* const T foo.  */
+  qualifier_const = 0x2, /* 1 << 1  */
+  /* T *foo.  */
+  qualifier_pointer = 0x4, /* 1 << 2  */
+  /* Used when expanding arguments if an operand could
+     be an immediate.  */
+  qualifier_immediate = 0x8, /* 1 << 3  */
+  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
+  /* void foo (...).  */
+  qualifier_void = 0x20, /* 1 << 5  */
+  /* Some patterns may have internal operands, this qualifier is an
+     instruction to the initialisation code to skip this operand.  */
+  qualifier_internal = 0x40, /* 1 << 6  */
+  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
+     rather than using the type of the operand.  */
+  qualifier_map_mode = 0x80, /* 1 << 7  */
+  /* qualifier_pointer | qualifier_map_mode  */
+  qualifier_pointer_map_mode = 0x84,
+  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
+  qualifier_const_pointer_map_mode = 0x86,
+  /* Polynomial types.  */
+  qualifier_poly = 0x100,
+  /* Lane indices - must be in range, and flipped for bigendian.  */
+  qualifier_lane_index = 0x200,
+  /* Lane indices for single lane structure loads and stores.  */
+  qualifier_struct_load_store_lane_index = 0x400,
+  /* Lane indices selected in pairs. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_pair_index = 0x800,
+  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_quadtup_index = 0x1000,
+};
+
+struct GTY(()) aarch64_simd_type_info
+{
+  enum aarch64_simd_type type;
+
+  /* Internal type name.  */
+  const char *name;
+
+  /* Internal type name(mangled).  The mangled names conform to the
+     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
+     Appendix A).  To qualify for emission with the mangled names defined in
+     that document, a vector type must not only be of the correct mode but also
+     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
+     types are registered by aarch64_init_simd_builtin_types ().  In other
+     words, vector types defined in other ways e.g. via vector_size attribute
+     will get default mangled names.  */
+  const char *mangle;
+
+  /* Internal type.  */
+  tree itype;
+
+  /* Element type.  */
+  tree eltype;
+
+  /* Machine mode the internal type maps to.  */
+  enum machine_mode mode;
+
+  /* Qualifiers.  */
+  enum aarch64_type_qualifiers q;
+};
+
+aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type);
+
+#endif /* AARCH64_BUILTINS_H */
+
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 02e42a71e5e..51e6c1a9cc4 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,14 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "aarch64-builtins.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "tree-ssa-operands.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "value-range.h"
+#include "tree-ssanames.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1215,56 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1] 
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))	
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
+
+	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
+	tree vectype = TREE_TYPE (mem_ref_lhs);
+	tree lhs_type = TREE_TYPE (lhs);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f07330cff4f..dc6e5ca1e1d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23009,6 +23009,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
+      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -23169,7 +23198,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+        {
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
new file mode 100644
index 00000000000..35100a9e01c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+svint32_t f1 (int32x4_t x)
+{
+  return svld1rq (svptrue_b8 (), &x[0]);
+}
+
+svint32_t f2 (int *x)
+{
+  return svld1rq (svptrue_b8 (), x);
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
  
Prathamesh Kulkarni May 3, 2022, 10:40 a.m. UTC | #3
On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > > Hi,
> > > The patch folds:
> > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > > into:
> > > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> > >
> > > With patch, for following test:
> > > #include <arm_sve.h>
> > > #include <arm_neon.h>
> > >
> > > svint32_t
> > > foo (int32x4_t x)
> > > {
> > >   return svld1rq (svptrue_b8 (), &x[0]);
> > > }
> > >
> > > it generates following code:
> > > foo:
> > > .LFB4350:
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > and passes bootstrap+test on aarch64-linux-gnu.
> > > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > > are correct.
> >
> > Just in case: I was only using int32x4_t in the PR as an example.
> > The same thing should work for all element types.
> >
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > index 02e42a71e5e..e21bbec360c 100644
> > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > @@ -1207,6 +1207,56 @@ public:
> > >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> > >      return e.use_contiguous_load_insn (icode);
> > >    }
> > > +
> > > +  gimple *
> > > +  fold (gimple_folder &f) const OVERRIDE
> > > +  {
> > > +    tree arg0 = gimple_call_arg (f.call, 0);
> > > +    tree arg1 = gimple_call_arg (f.call, 1);
> > > +
> > > +    /* Transform:
> > > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > > +       into:
> > > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > > +       on little endian target.  */
> > > +
> > > +    if (!BYTES_BIG_ENDIAN
> > > +     && integer_all_onesp (arg0)
> > > +     && TREE_CODE (arg1) == ADDR_EXPR)
> > > +      {
> > > +     tree t = TREE_OPERAND (arg1, 0);
> > > +     if (TREE_CODE (t) == ARRAY_REF)
> > > +       {
> > > +         tree index = TREE_OPERAND (t, 1);
> > > +         t = TREE_OPERAND (t, 0);
> > > +         if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > > +           {
> > > +             t = TREE_OPERAND (t, 0);
> > > +             tree vectype = TREE_TYPE (t);
> > > +             if (VECTOR_TYPE_P (vectype)
> > > +                 && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > > +                 && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > > +               {
> >
> > Since this is quite a specific pattern match, and since we now lower
> > arm_neon.h vld1* to normal gimple accesses, I think we should try the
> > “more generally” approach mentioned in the PR and see what the fallout
> > is.  That is, keep:
> >
> >     if (!BYTES_BIG_ENDIAN
> >         && integer_all_onesp (arg0)
> >
> > If those conditions pass, create an Advanced SIMD access at address arg1,
> > using similar code to the handling of:
> >
> >      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
> >      BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> >      BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
> >
> > in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> > common code to aarch64.c so that both files can use it.)
> >
> > > +                 tree lhs = gimple_call_lhs (f.call);
> > > +                 tree lhs_type = TREE_TYPE (lhs);
> > > +                 int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > > +                 vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > > +                 for (int i = 0; i < source_nelts; i++)
> > > +                   sel.quick_push (i);
> > > +
> > > +                 vec_perm_indices indices (sel, 1, source_nelts);
> > > +                 if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> > > +                   return NULL;
> >
> > I don't think we need to check this: it should always be true.
> > Probably worth keeping as a gcc_checking_assert though.
> >
> > > +
> > > +                 tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > > +                 return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> > > +               }
> > > +           }
> > > +       }
> > > +      }
> > > +
> > > +    return NULL;
> > > +  }
> > >  };
> > >
> > >  class svld1ro_impl : public load_replicate
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index f07330cff4f..af27f550be3 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> > >
> > >    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> > >    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > > +
> > >    if (d->one_vector_p)
> > > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > > +    {
> > > +      bool use_dupq = false;
> > > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> > > +      if (GET_CODE (sel) == CONST_VECTOR
> > > +       && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > > +       && CONST_VECTOR_DUPLICATE_P (sel))
> > > +       {
> > > +         unsigned nelts = const_vector_encoded_nelts (sel);
> > > +         unsigned i;
> > > +         for (i = 0; i < nelts; i++)
> > > +           {
> > > +             rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > > +             if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > > +               break;
> > > +           }
> > > +         if (i == nelts)
> > > +           use_dupq = true;
> > > +       }
> > > +
> > > +      if (use_dupq)
> > > +     aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > > +      else
> > > +     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > > +    }
> >
> > This shouldn't be a TBL but a new operation, handled by its own
> > aarch64_evpc_sve_* routine.  The check for the mask should then
> > be done on d->perm, to detect whether the permutation is one
> > that the new routine supports.
> >
> > I think the requirements are:
> >
> > - !BYTES_BIG_ENDIAN
> > - the source must be an Advanced SIMD vector
> > - the destination must be an SVE vector
> > - the permutation must be a duplicate (tested in the code above)
> > - the number of “patterns” in the permutation must equal the number of
> >   source elements
> > - element X of the permutation must equal X (tested in the code above)
> >
> > The existing aarch64_evpc_* routines expect the source and target modes
> > to be the same, so we should only call them when that's true.
> Hi Richard,
> Thanks for the suggestions, and sorry for late reply.
> Does the following patch look OK (sans the refactoring of building mem_ref) ?
> Passes bootstrap+test on aarch64-linux-gnu.
Hi Richard,
Since stage-1 has reopened, does the attached patch look OK to commit?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index c21476d7ae9..cfcd9117ce3 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -47,6 +47,7 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "gimple-fold.h"
+#include "aarch64-builtins.h"
 
 #define v8qi_UP  E_V8QImode
 #define v8di_UP  E_V8DImode
@@ -128,46 +129,6 @@
 
 #define SIMD_MAX_BUILTIN_ARGS 5
 
-enum aarch64_type_qualifiers
-{
-  /* T foo.  */
-  qualifier_none = 0x0,
-  /* unsigned T foo.  */
-  qualifier_unsigned = 0x1, /* 1 << 0  */
-  /* const T foo.  */
-  qualifier_const = 0x2, /* 1 << 1  */
-  /* T *foo.  */
-  qualifier_pointer = 0x4, /* 1 << 2  */
-  /* Used when expanding arguments if an operand could
-     be an immediate.  */
-  qualifier_immediate = 0x8, /* 1 << 3  */
-  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
-  /* void foo (...).  */
-  qualifier_void = 0x20, /* 1 << 5  */
-  /* Some patterns may have internal operands, this qualifier is an
-     instruction to the initialisation code to skip this operand.  */
-  qualifier_internal = 0x40, /* 1 << 6  */
-  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
-     rather than using the type of the operand.  */
-  qualifier_map_mode = 0x80, /* 1 << 7  */
-  /* qualifier_pointer | qualifier_map_mode  */
-  qualifier_pointer_map_mode = 0x84,
-  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
-  qualifier_const_pointer_map_mode = 0x86,
-  /* Polynomial types.  */
-  qualifier_poly = 0x100,
-  /* Lane indices - must be in range, and flipped for bigendian.  */
-  qualifier_lane_index = 0x200,
-  /* Lane indices for single lane structure loads and stores.  */
-  qualifier_struct_load_store_lane_index = 0x400,
-  /* Lane indices selected in pairs. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_pair_index = 0x800,
-  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_quadtup_index = 0x1000,
-};
-
 /* Flags that describe what a function might do.  */
 const unsigned int FLAG_NONE = 0U;
 const unsigned int FLAG_READ_FPCR = 1U << 0;
@@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = {
   NULL
 };
 
-#define ENTRY(E, M, Q, G) E,
-enum aarch64_simd_type
-{
-#include "aarch64-simd-builtin-types.def"
-  ARM_NEON_H_TYPES_LAST
-};
-#undef ENTRY
-
-struct GTY(()) aarch64_simd_type_info
-{
-  enum aarch64_simd_type type;
-
-  /* Internal type name.  */
-  const char *name;
-
-  /* Internal type name(mangled).  The mangled names conform to the
-     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
-     Appendix A).  To qualify for emission with the mangled names defined in
-     that document, a vector type must not only be of the correct mode but also
-     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
-     types are registered by aarch64_init_simd_builtin_types ().  In other
-     words, vector types defined in other ways e.g. via vector_size attribute
-     will get default mangled names.  */
-  const char *mangle;
-
-  /* Internal type.  */
-  tree itype;
-
-  /* Element type.  */
-  tree eltype;
-
-  /* Machine mode the internal type maps to.  */
-  enum machine_mode mode;
-
-  /* Qualifiers.  */
-  enum aarch64_type_qualifiers q;
-};
-
 #define ENTRY(E, M, Q, G)  \
   {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q},
 static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = {
@@ -2826,6 +2749,14 @@ get_mem_type_for_load_store (unsigned int fcode)
   }
 }
 
+/* Return aarch64_simd_type_info corresponding to TYPE.  */
+
+aarch64_simd_type_info
+aarch64_get_simd_info_for_type (enum aarch64_simd_type type)
+{
+  return aarch64_simd_types[type];
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h
new file mode 100644
index 00000000000..4d155566dc5
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtins.h
@@ -0,0 +1,101 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_BUILTINS_H
+#define AARCH64_BUILTINS_H
+
+#define ENTRY(E, M, Q, G) E,
+enum aarch64_simd_type
+{
+#include "aarch64-simd-builtin-types.def"
+  ARM_NEON_H_TYPES_LAST
+};
+#undef ENTRY
+
+enum aarch64_type_qualifiers
+{
+  /* T foo.  */
+  qualifier_none = 0x0,
+  /* unsigned T foo.  */
+  qualifier_unsigned = 0x1, /* 1 << 0  */
+  /* const T foo.  */
+  qualifier_const = 0x2, /* 1 << 1  */
+  /* T *foo.  */
+  qualifier_pointer = 0x4, /* 1 << 2  */
+  /* Used when expanding arguments if an operand could
+     be an immediate.  */
+  qualifier_immediate = 0x8, /* 1 << 3  */
+  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
+  /* void foo (...).  */
+  qualifier_void = 0x20, /* 1 << 5  */
+  /* Some patterns may have internal operands, this qualifier is an
+     instruction to the initialisation code to skip this operand.  */
+  qualifier_internal = 0x40, /* 1 << 6  */
+  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
+     rather than using the type of the operand.  */
+  qualifier_map_mode = 0x80, /* 1 << 7  */
+  /* qualifier_pointer | qualifier_map_mode  */
+  qualifier_pointer_map_mode = 0x84,
+  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
+  qualifier_const_pointer_map_mode = 0x86,
+  /* Polynomial types.  */
+  qualifier_poly = 0x100,
+  /* Lane indices - must be in range, and flipped for bigendian.  */
+  qualifier_lane_index = 0x200,
+  /* Lane indices for single lane structure loads and stores.  */
+  qualifier_struct_load_store_lane_index = 0x400,
+  /* Lane indices selected in pairs. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_pair_index = 0x800,
+  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_quadtup_index = 0x1000,
+};
+
+struct GTY(()) aarch64_simd_type_info
+{
+  enum aarch64_simd_type type;
+
+  /* Internal type name.  */
+  const char *name;
+
+  /* Internal type name(mangled).  The mangled names conform to the
+     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
+     Appendix A).  To qualify for emission with the mangled names defined in
+     that document, a vector type must not only be of the correct mode but also
+     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
+     types are registered by aarch64_init_simd_builtin_types ().  In other
+     words, vector types defined in other ways e.g. via vector_size attribute
+     will get default mangled names.  */
+  const char *mangle;
+
+  /* Internal type.  */
+  tree itype;
+
+  /* Element type.  */
+  tree eltype;
+
+  /* Machine mode the internal type maps to.  */
+  enum machine_mode mode;
+
+  /* Qualifiers.  */
+  enum aarch64_type_qualifiers q;
+};
+
+aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type);
+
+#endif /* AARCH64_BUILTINS_H */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c0548724..1ef4ea2087b 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,14 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "aarch64-builtins.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "tree-ssa-operands.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "value-range.h"
+#include "tree-ssanames.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1215,56 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
+
+	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
+	tree vectype = TREE_TYPE (mem_ref_lhs);
+	tree lhs_type = TREE_TYPE (lhs);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce..47810fec804 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
+      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
new file mode 100644
index 00000000000..35100a9e01c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+svint32_t f1 (int32x4_t x)
+{
+  return svld1rq (svptrue_b8 (), &x[0]);
+}
+
+svint32_t f2 (int *x)
+{
+  return svld1rq (svptrue_b8 (), x);
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
  
Richard Sandiford May 6, 2022, 10:30 a.m. UTC | #4
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c0548724..1ef4ea2087b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,14 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "aarch64-builtins.h"
> +#include "gimple-ssa.h"
> +#include "tree-phinodes.h"
> +#include "tree-ssa-operands.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "value-range.h"
> +#include "tree-ssanames.h"

Minor, but: I think the preferred approach is to include "ssa.h"
rather than include some of these headers directly.
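
(That is, a single:

  #include "ssa.h"

should be enough here; it pulls in gimple-ssa.h, tree-phinodes.h,
ssa-iterators.h, tree-ssanames.h and friends.)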

>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1215,56 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);

Does this work for other element sizes?  I would have expected it
to be the (128-bit) Advanced SIMD vector associated with the same
element type as the SVE vector.

The testcase should cover more than just int32x4_t -> svint32_t,
just to be sure.

> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> +
> +	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));

Long line.  Might be easier to format by assigning the fold_build2 result
to a temporary variable.

> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> +	tree vectype = TREE_TYPE (mem_ref_lhs);
> +	tree lhs_type = TREE_TYPE (lhs);

Is this necessary?  The code above supplied the types and I wouldn't
have expected them to change during the build process.

> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> +	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> +	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);

Nit: long line.

> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce..47810fec804 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || d->perm.length ().is_constant ()
> +      || !d->one_vector_p
> +      || d->target == NULL
> +      || d->op0 == NULL

These last two lines mean that we always return false for d->testing.
The idea instead is that the return value should be the same for both
d->testing and !d->testing.  The difference is that for !d->testing we
also emit code to do the permute.
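
I.e. the tail of the routine would look something like this (a sketch,
reusing the code from the patch):

  /* All the checks above use only d->perm, the modes and d->vec_flags,
     so they give the same answer for testing and for expansion.  */
  if (d->testing_p)
    return true;

  /* Only now emit the code for the permute.  */
  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
  return true;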

> +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()

Sorry, I've forgotten the context now, but: these positive tests
for is_constant surprised me.  Do we really only want to do this
for variable-length SVE code generation, rather than fixed-length?

> +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;

This should happen after the later tests, once we're sure that the
permute vector has the right form.  If the issue is that op0 isn't
provided for testing then I think the hook needs to be passed the
input mode alongside the result mode.

It might then be better to test:

  aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD

(despite what I said earlier, about testing is_constant, sorry).

> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> +    return false;
> +
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        else if (aarch64_evpc_reencode (d))
>  	return true;
>        if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> +	{
> +	  if (aarch64_evpc_sve_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_sve_tbl (d))
> +	    return true;
> +	}
>        else if (d->vec_flags == VEC_ADVSIMD)
>  	return aarch64_evpc_tbl (d);
>      }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> new file mode 100644
> index 00000000000..35100a9e01c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +svint32_t f1 (int32x4_t x)
> +{
> +  return svld1rq (svptrue_b8 (), &x[0]);
> +}
> +
> +svint32_t f2 (int *x)
> +{
> +  return svld1rq (svptrue_b8 (), x);
> +}
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
  
Prathamesh Kulkarni May 11, 2022, 6:24 a.m. UTC | #5
On Fri, 6 May 2022 at 16:00, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index c24c0548724..1ef4ea2087b 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,14 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "aarch64-builtins.h"
> > +#include "gimple-ssa.h"
> > +#include "tree-phinodes.h"
> > +#include "tree-ssa-operands.h"
> > +#include "ssa-iterators.h"
> > +#include "stringpool.h"
> > +#include "value-range.h"
> > +#include "tree-ssanames.h"
>
> Minor, but: I think the preferred approach is to include "ssa.h"
> rather than include some of these headers directly.
>
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1215,56 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>
> Does this work for other element sizes?  I would have expected it
> to be the (128-bit) Advanced SIMD vector associated with the same
> element type as the SVE vector.
>
> The testcase should cover more than just int32x4_t -> svint32_t,
> just to be sure.
In the attached patch, it obtains the corresponding Advanced SIMD type with:

tree eltype = TREE_TYPE (lhs_type);
unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
tree vectype = build_vector_type (eltype, nunits);

While this seems to work with different element sizes, I am not sure if it's
the correct approach?
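
(For example, with an svint16_t lhs this gives eltype == int16_t, so
nunits == 128 / 16 == 8, and vectype becomes a generic 8 x int16 vector
with the same V8HImode layout as int16x8_t, though not the internal
AdvSIMD type that arm_neon.h registers.)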
>
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> > +
> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>
> Long line.  Might be easier to format by assigning the fold_build2 result
> to a temporary variable.
>
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> > +     tree lhs_type = TREE_TYPE (lhs);
>
> Is this necessary?  The code above supplied the types and I wouldn't
> have expected them to change during the build process.
>
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>
> Nit: long line.
>
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f650abbc4ce..47810fec804 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || d->perm.length ().is_constant ()
> > +      || !d->one_vector_p
> > +      || d->target == NULL
> > +      || d->op0 == NULL
>
> These last two lines mean that we always return false for d->testing.
> The idea instead is that the return value should be the same for both
> d->testing and !d->testing.  The difference is that for !d->testing we
> also emit code to do the permute.
>
> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>
> Sorry, I've forgotten the context now, but: these positive tests
> for is_constant surprised me.  Do we really only want to do this
> for variable-length SVE code generation, rather than fixed-length?
>
> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> > +    return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
>
> This should happen after the later tests, once we're sure that the
> permute vector has the right form.  If the issue is that op0 isn't
> provided for testing then I think the hook needs to be passed the
> input mode alongside the result mode.
>
> It might then be better to test:
>
>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
>
> (despite what I said earlier, about testing is_constant, sorry).
Thanks for the suggestions, I tried to address them in the attached patch.
Does it look OK after bootstrap+test ?

The patch seems to generate the same code for different vector types.
For eg:

svint32_t foo (int32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

svint16_t foo2(int16x8_t x)
{
  return svld1rq_s16 (svptrue_b8 (), &x[0]);
}

.optimized dump:
;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
cgraph_uid=4351, symbol_order=4350)
svint32_t foo (int32x4_t x)
{
  svint32_t _2;

  <bb 2> [local count: 1073741824]:
  _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
  return _2;

}

;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
cgraph_uid=4352, symbol_order=4351)

svint16_t foo2 (int16x8_t x)
{
  svint16_t _2;

  <bb 2> [local count: 1073741824]:
  _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
  return _2;

}

resulting in code-gen:
foo:
        dup     z0.q, z0.q[0]
        ret

foo2:
        dup     z0.q, z0.q[0]
        ret

I suppose this is correct, since in both cases it's replicating the
entire 128-bit vector (irrespective of element sizes) ?

Thanks,
Prathamesh
>
> > +
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> > +    return false;
> > +
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >        else if (aarch64_evpc_reencode (d))
> >       return true;
> >        if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > +     {
> > +       if (aarch64_evpc_sve_dup (d))
> > +         return true;
> > +       else if (aarch64_evpc_sve_tbl (d))
> > +         return true;
> > +     }
> >        else if (d->vec_flags == VEC_ADVSIMD)
> >       return aarch64_evpc_tbl (d);
> >      }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> > new file mode 100644
> > index 00000000000..35100a9e01c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +svint32_t f1 (int32x4_t x)
> > +{
> > +  return svld1rq (svptrue_b8 (), &x[0]);
> > +}
> > +
> > +svint32_t f2 (int *x)
> > +{
> > +  return svld1rq (svptrue_b8 (), x);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c0548724..8a2e5b886e4 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,59 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	tree eltype = TREE_TYPE (lhs_type);
+	unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
+	tree vectype = build_vector_type (eltype, nunits);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce..072ec9bd153 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0
+      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
new file mode 100644
index 00000000000..5af3b6ed24c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
new file mode 100644
index 00000000000..17e78c57c1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
  
Richard Sandiford May 11, 2022, 7:14 a.m. UTC | #6
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Fri, 6 May 2022 at 16:00, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index c24c0548724..1ef4ea2087b 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -44,6 +44,14 @@
>> >  #include "aarch64-sve-builtins-shapes.h"
>> >  #include "aarch64-sve-builtins-base.h"
>> >  #include "aarch64-sve-builtins-functions.h"
>> > +#include "aarch64-builtins.h"
>> > +#include "gimple-ssa.h"
>> > +#include "tree-phinodes.h"
>> > +#include "tree-ssa-operands.h"
>> > +#include "ssa-iterators.h"
>> > +#include "stringpool.h"
>> > +#include "value-range.h"
>> > +#include "tree-ssanames.h"
>>
>> Minor, but: I think the preferred approach is to include "ssa.h"
>> rather than include some of these headers directly.
>>
>> >
>> >  using namespace aarch64_sve;
>> >
>> > @@ -1207,6 +1215,56 @@ public:
>> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >      return e.use_contiguous_load_insn (icode);
>> >    }
>> > +
>> > +  gimple *
>> > +  fold (gimple_folder &f) const OVERRIDE
>> > +  {
>> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> > +
>> > +    /* Transform:
>> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> > +       into:
>> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
>> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> > +       on little endian target.  */
>> > +
>> > +    if (!BYTES_BIG_ENDIAN
>> > +     && integer_all_onesp (arg0))
>> > +      {
>> > +     tree lhs = gimple_call_lhs (f.call);
>> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>>
>> Does this work for other element sizes?  I would have expected it
>> to be the (128-bit) Advanced SIMD vector associated with the same
>> element type as the SVE vector.
>>
>> The testcase should cover more than just int32x4_t -> svint32_t,
>> just to be sure.
> In the attached patch, it obtains corresponding advsimd type with:
>
> tree eltype = TREE_TYPE (lhs_type);
> unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> tree vectype = build_vector_type (eltype, nunits);
>
> While this seems to work with different element sizes, I am not sure if it's
> the correct approach ?

Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
to get the vector mode associated with a .Q “element”, so an
alternative would be:

    machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
    tree vectype = build_vector_type_for_mode (eltype, vq_mode);

which is more explicit about wanting an Advanced SIMD vector.
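
(In the fold above, those two lines would simply replace the

    unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
    tree vectype = build_vector_type (eltype, nunits);

pair; the rest of the sequence stays as-is.)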

>> > +
>> > +     tree elt_ptr_type
>> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> > +     tree zero = build_zero_cst (elt_ptr_type);
>> > +
>> > +     /* Use element type alignment.  */
>> > +     tree access_type
>> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
>> > +
>> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> > +     gimple *mem_ref_stmt
>> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>>
>> Long line.  Might be easier to format by assigning the fold_build2 result
>> to a temporary variable.
>>
>> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> > +
>> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
>> > +     tree lhs_type = TREE_TYPE (lhs);
>>
>> Is this necessary?  The code above supplied the types and I wouldn't
>> have expected them to change during the build process.
>>
>> > +
>> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
>> > +     for (int i = 0; i < source_nelts; i++)
>> > +       sel.quick_push (i);
>> > +
>> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
>> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>>
>> Nit: long line.
>>
>> > +      }
>> > +
>> > +    return NULL;
>> > +  }
>> >  };
>> >
>> >  class svld1ro_impl : public load_replicate
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index f650abbc4ce..47810fec804 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >    return true;
>> >  }
>> >
>> > +/* Try to implement D using SVE dup instruction.  */
>> > +
>> > +static bool
>> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> > +{
>> > +  if (BYTES_BIG_ENDIAN
>> > +      || d->perm.length ().is_constant ()
>> > +      || !d->one_vector_p
>> > +      || d->target == NULL
>> > +      || d->op0 == NULL
>>
>> These last two lines mean that we always return false for d->testing.
>> The idea instead is that the return value should be the same for both
>> d->testing and !d->testing.  The difference is that for !d->testing we
>> also emit code to do the permute.

It doesn't look like the new patch addresses this.  There should be
no checks for/uses of “d->target” and “d->op0” until after:

  if (d->testing_p)
    return true;

This...  

>> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>>
>> Sorry, I've forgotten the context now, but: these positive tests
>> for is_constant surprised me.  Do we really only want to do this
>> for variable-length SVE code generation, rather than fixed-length?
>>
>> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
>> > +    return false;
>> > +
>> > +  if (d->testing_p)
>> > +    return true;
>>
>> This should happen after the later tests, once we're sure that the
>> permute vector has the right form.  If the issue is that op0 isn't
>> provided for testing then I think the hook needs to be passed the
>> input mode alongside the result mode.

...was my guess about why the checks were there.
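
Concretely, the shape I'd expect is roughly the following (a sketch only,
and “op_mode” below assumes the input mode does get passed to the hook):

    /* Validate the permute form using just d->perm and the modes;
       no uses of d->target or d->op0 before the testing_p check.  */
    int npatterns = d->perm.encoding ().npatterns ();
    if (!known_eq (npatterns, GET_MODE_NUNITS (op_mode)))
      return false;

    for (int i = 0; i < npatterns; i++)
      if (!known_eq (d->perm[i], i))
        return false;

    if (d->testing_p)
      return true;

    aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
    return true;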

>> It might then be better to test:
>>
>>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
>>
>> (despite what I said earlier, about testing is_constant, sorry).
> Thanks for the suggestions, I tried to address them in the attached patch.
> Does it look OK after bootstrap+test ?
>
> The patch seems to generate the same code for different vector types.
> For eg:
>
> svint32_t foo (int32x4_t x)
> {
>   return svld1rq (svptrue_b8 (), &x[0]);
> }
>
> svint16_t foo2(int16x8_t x)
> {
>   return svld1rq_s16 (svptrue_b8 (), &x[0]);
> }
>
> .optimized dump:
> ;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
> cgraph_uid=4351, symbol_order=4350)
> svint32_t foo (int32x4_t x)
> {
>   svint32_t _2;
>
>   <bb 2> [local count: 1073741824]:
>   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
>   return _2;
>
> }
>
> ;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
> cgraph_uid=4352, symbol_order=4351)
>
> svint16_t foo2 (int16x8_t x)
> {
>   svint16_t _2;
>
>   <bb 2> [local count: 1073741824]:
>   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
>   return _2;
>
> }
>
> resulting in code-gen:
> foo:
>         dup     z0.q, z0.q[0]
>         ret
>
> foo2:
>         dup     z0.q, z0.q[0]
>         ret
>
> I suppose this is correct, since in both cases it's replicating the
> entire 128-bit vector (irrespective of element sizes) ?

Yeah, the output code will be the same for all cases.

> Thanks,
> Prathamesh
>>
>> > +
>> > +  int npatterns = d->perm.encoding ().npatterns ();
>> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
>> > +    return false;
>> > +
>> > +  for (int i = 0; i < npatterns; i++)
>> > +    if (!known_eq (d->perm[i], i))
>> > +      return false;
>> > +
>> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
>> > +  return true;
>> > +}
>> > +
>> >  /* Try to implement D using SVE SEL instruction.  */
>> >
>> >  static bool
>> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >        else if (aarch64_evpc_reencode (d))
>> >       return true;
>> >        if (d->vec_flags == VEC_SVE_DATA)
>> > -     return aarch64_evpc_sve_tbl (d);
>> > +     {
>> > +       if (aarch64_evpc_sve_dup (d))
>> > +         return true;
>> > +       else if (aarch64_evpc_sve_tbl (d))
>> > +         return true;
>> > +     }
>> >        else if (d->vec_flags == VEC_ADVSIMD)
>> >       return aarch64_evpc_tbl (d);
>> >      }
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
>> > new file mode 100644
>> > index 00000000000..35100a9e01c
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
>> > @@ -0,0 +1,17 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +
>> > +#include "arm_neon.h"
>> > +#include "arm_sve.h"
>> > +
>> > +svint32_t f1 (int32x4_t x)
>> > +{
>> > +  return svld1rq (svptrue_b8 (), &x[0]);
>> > +}
>> > +
>> > +svint32_t f2 (int *x)
>> > +{
>> > +  return svld1rq (svptrue_b8 (), x);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c0548724..8a2e5b886e4 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1208,59 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	tree lhs_type = TREE_TYPE (lhs);
> +	tree eltype = TREE_TYPE (lhs_type);
> +	unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> +	tree vectype = build_vector_type (eltype, nunits);
> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +						   indices));
> +	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +				    mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce..072ec9bd153 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || d->perm.length ().is_constant ()
> +      || !d->one_vector_p
> +      || d->target == NULL
> +      || d->op0 == NULL
> +      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0

This check isn't necessary, since the caller has already checked that
this is an SVE permute.

> +      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
> +    return false;
> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> +    return false;
> +
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        else if (aarch64_evpc_reencode (d))
>  	return true;
>        if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> +	{
> +	  if (aarch64_evpc_sve_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_sve_tbl (d))
> +	    return true;
> +	}
>        else if (d->vec_flags == VEC_ADVSIMD)
>  	return aarch64_evpc_tbl (d);
>      }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> new file mode 100644
> index 00000000000..5af3b6ed24c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +#define TEST(ret_type, param_type, suffix) \
> +ret_type test_##suffix(param_type x) \
> +{ \
> +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> +}
> +
> +TEST(svint8_t, int8x16_t, s8)
> +TEST(svint16_t, int16x8_t, s16)
> +TEST(svint32_t, int32x4_t, s32)
> +TEST(svint64_t, int64x2_t, s64)
> +
> +TEST(svuint8_t, uint8x16_t, u8)
> +TEST(svuint16_t, uint16x8_t, u16)
> +TEST(svuint32_t, uint32x4_t, u32)
> +TEST(svuint64_t, uint64x2_t, u64)
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> new file mode 100644
> index 00000000000..17e78c57c1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +#define TEST(ret_type, param_type, suffix) \
> +ret_type test_##suffix(param_type *x) \
> +{ \
> +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> +}
> +
> +TEST(svint8_t, int8_t, s8)
> +TEST(svint16_t, int16_t, s16)
> +TEST(svint32_t, int32_t, s32)
> +TEST(svint64_t, int64_t, s64)
> +
> +TEST(svuint8_t, uint8_t, u8)
> +TEST(svuint16_t, uint16_t, u16)
> +TEST(svuint32_t, uint32_t, u32)
> +TEST(svuint64_t, uint64_t, u64)
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */

It would be good to check the float modes too.

Thanks,
Richard
  
Prathamesh Kulkarni May 12, 2022, 9:12 a.m. UTC | #7
On Wed, 11 May 2022 at 12:44, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > index c24c0548724..1ef4ea2087b 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > @@ -44,6 +44,14 @@
> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >  #include "aarch64-sve-builtins-base.h"
> >> >  #include "aarch64-sve-builtins-functions.h"
> >> > +#include "aarch64-builtins.h"
> >> > +#include "gimple-ssa.h"
> >> > +#include "tree-phinodes.h"
> >> > +#include "tree-ssa-operands.h"
> >> > +#include "ssa-iterators.h"
> >> > +#include "stringpool.h"
> >> > +#include "value-range.h"
> >> > +#include "tree-ssanames.h"
> >>
> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> rather than include some of these headers directly.
> >>
> >> >
> >> >  using namespace aarch64_sve;
> >> >
> >> > @@ -1207,6 +1215,56 @@ public:
> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >      return e.use_contiguous_load_insn (icode);
> >> >    }
> >> > +
> >> > +  gimple *
> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> > +  {
> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
> >> > +
> >> > +    /* Transform:
> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> >> > +       into:
> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> >> > +       on little endian target.  */
> >> > +
> >> > +    if (!BYTES_BIG_ENDIAN
> >> > +     && integer_all_onesp (arg0))
> >> > +      {
> >> > +     tree lhs = gimple_call_lhs (f.call);
> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >>
> >> Does this work for other element sizes?  I would have expected it
> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> element type as the SVE vector.
> >>
> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> just to be sure.
> > In the attached patch, it obtains corresponding advsimd type with:
> >
> > tree eltype = TREE_TYPE (lhs_type);
> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> > tree vectype = build_vector_type (eltype, nunits);
> >
> > While this seems to work with different element sizes, I am not sure if it's
> > the correct approach ?
>
> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> to get the vector mode associated with a .Q “element”, so an
> alternative would be:
>
>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>
> which is more explicit about wanting an Advanced SIMD vector.
>
> >> > +
> >> > +     tree elt_ptr_type
> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> >> > +     tree zero = build_zero_cst (elt_ptr_type);
> >> > +
> >> > +     /* Use element type alignment.  */
> >> > +     tree access_type
> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> >> > +
> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> > +     gimple *mem_ref_stmt
> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
> >>
> >> Long line.  Might be easier to format by assigning the fold_build2 result
> >> to a temporary variable.
> >>
> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> > +
> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> >> > +     tree lhs_type = TREE_TYPE (lhs);
> >>
> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> have expected them to change during the build process.
> >>
> >> > +
> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> >> > +     for (int i = 0; i < source_nelts; i++)
> >> > +       sel.quick_push (i);
> >> > +
> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
> >>
> >> Nit: long line.
> >>
> >> > +      }
> >> > +
> >> > +    return NULL;
> >> > +  }
> >> >  };
> >> >
> >> >  class svld1ro_impl : public load_replicate
> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> > index f650abbc4ce..47810fec804 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >> >    return true;
> >> >  }
> >> >
> >> > +/* Try to implement D using SVE dup instruction.  */
> >> > +
> >> > +static bool
> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> >> > +{
> >> > +  if (BYTES_BIG_ENDIAN
> >> > +      || d->perm.length ().is_constant ()
> >> > +      || !d->one_vector_p
> >> > +      || d->target == NULL
> >> > +      || d->op0 == NULL
> >>
> >> These last two lines mean that we always return false for d->testing.
> >> The idea instead is that the return value should be the same for both
> >> d->testing and !d->testing.  The difference is that for !d->testing we
> >> also emit code to do the permute.
>
> It doesn't look like the new patch addresses this.  There should be
> no checks for/uses of “d->target” and “d->op0” until after:
>
>   if (d->testing_p)
>     return true;
>
> This...
>
> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
> >>
> >> Sorry, I've forgotten the context now, but: these positive tests
> >> for is_constant surprised me.  Do we really only want to do this
> >> for variable-length SVE code generation, rather than fixed-length?
> >>
> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> >> > +    return false;
> >> > +
> >> > +  if (d->testing_p)
> >> > +    return true;
> >>
> >> This should happen after the later tests, once we're sure that the
> >> permute vector has the right form.  If the issue is that op0 isn't
> >> provided for testing then I think the hook needs to be passed the
> >> input mode alongside the result mode.
>
> ...was my guess about why the checks were there.
Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
In that case, how do we obtain input mode ?

Thanks,
Prathamesh

>
> >> It might then be better to test:
> >>
> >>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
> >>
> >> (despite what I said earlier, about testing is_constant, sorry).
> > Thanks for the suggestions, I tried to address them in the attached patch.
> > Does it look OK after bootstrap+test ?
> >
> > The patch seems to generate the same code for different vector types.
> > For eg:
> >
> > svint32_t foo (int32x4_t x)
> > {
> >   return svld1rq (svptrue_b8 (), &x[0]);
> > }
> >
> > svint16_t foo2(int16x8_t x)
> > {
> >   return svld1rq_s16 (svptrue_b8 (), &x[0]);
> > }
> >
> > .optimized dump:
> > ;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
> > cgraph_uid=4351, symbol_order=4350)
> > svint32_t foo (int32x4_t x)
> > {
> >   svint32_t _2;
> >
> >   <bb 2> [local count: 1073741824]:
> >   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
> >   return _2;
> >
> > }
> >
> > ;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
> > cgraph_uid=4352, symbol_order=4351)
> >
> > svint16_t foo2 (int16x8_t x)
> > {
> >   svint16_t _2;
> >
> >   <bb 2> [local count: 1073741824]:
> >   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
> >   return _2;
> >
> > }
> >
> > resulting in code-gen:
> > foo:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > foo2:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > I suppose this is correct, since in both cases it's replicating the
> > entire 128-bit vector (irrespective of element sizes) ?
>
> Yeah, the output code will be the same for all cases.
>
> > Thanks,
> > Prathamesh
> >>
> >> > +
> >> > +  int npatterns = d->perm.encoding ().npatterns ();
> >> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> >> > +    return false;
> >> > +
> >> > +  for (int i = 0; i < npatterns; i++)
> >> > +    if (!known_eq (d->perm[i], i))
> >> > +      return false;
> >> > +
> >> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> >> > +  return true;
> >> > +}
> >> > +
> >> >  /* Try to implement D using SVE SEL instruction.  */
> >> >
> >> >  static bool
> >> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >> >        else if (aarch64_evpc_reencode (d))
> >> >       return true;
> >> >        if (d->vec_flags == VEC_SVE_DATA)
> >> > -     return aarch64_evpc_sve_tbl (d);
> >> > +     {
> >> > +       if (aarch64_evpc_sve_dup (d))
> >> > +         return true;
> >> > +       else if (aarch64_evpc_sve_tbl (d))
> >> > +         return true;
> >> > +     }
> >> >        else if (d->vec_flags == VEC_ADVSIMD)
> >> >       return aarch64_evpc_tbl (d);
> >> >      }
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> >> > new file mode 100644
> >> > index 00000000000..35100a9e01c
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> >> > @@ -0,0 +1,17 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +
> >> > +#include "arm_neon.h"
> >> > +#include "arm_sve.h"
> >> > +
> >> > +svint32_t f1 (int32x4_t x)
> >> > +{
> >> > +  return svld1rq (svptrue_b8 (), &x[0]);
> >> > +}
> >> > +
> >> > +svint32_t f2 (int *x)
> >> > +{
> >> > +  return svld1rq (svptrue_b8 (), x);
> >> > +}
> >> > +
> >> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index c24c0548724..8a2e5b886e4 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,7 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "ssa.h"
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1208,59 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.
> > +       vectype is the corresponding ADVSIMD type.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     tree lhs_type = TREE_TYPE (lhs);
> > +     tree eltype = TREE_TYPE (lhs_type);
> > +     unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> > +     tree vectype = build_vector_type (eltype, nunits);
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> > +
> > +     tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> > +     tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > +                                                indices));
> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR,
> > +                                 mem_ref_lhs, mem_ref_lhs, mask);
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f650abbc4ce..072ec9bd153 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || d->perm.length ().is_constant ()
> > +      || !d->one_vector_p
> > +      || d->target == NULL
> > +      || d->op0 == NULL
> > +      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0
>
> This check isn't necessary, since the caller has already checked that
> this is an SVE permute.
>
> > +      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
> > +    return false;
> > +
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> > +    return false;
> > +
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >        else if (aarch64_evpc_reencode (d))
> >       return true;
> >        if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > +     {
> > +       if (aarch64_evpc_sve_dup (d))
> > +         return true;
> > +       else if (aarch64_evpc_sve_tbl (d))
> > +         return true;
> > +     }
> >        else if (d->vec_flags == VEC_ADVSIMD)
> >       return aarch64_evpc_tbl (d);
> >      }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> > new file mode 100644
> > index 00000000000..5af3b6ed24c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +#define TEST(ret_type, param_type, suffix) \
> > +ret_type test_##suffix(param_type x) \
> > +{ \
> > +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> > +}
> > +
> > +TEST(svint8_t, int8x16_t, s8)
> > +TEST(svint16_t, int16x8_t, s16)
> > +TEST(svint32_t, int32x4_t, s32)
> > +TEST(svint64_t, int64x2_t, s64)
> > +
> > +TEST(svuint8_t, uint8x16_t, u8)
> > +TEST(svuint16_t, uint16x8_t, u16)
> > +TEST(svuint32_t, uint32x4_t, u32)
> > +TEST(svuint64_t, uint64x2_t, u64)
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> > new file mode 100644
> > index 00000000000..17e78c57c1b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +#define TEST(ret_type, param_type, suffix) \
> > +ret_type test_##suffix(param_type *x) \
> > +{ \
> > +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> > +}
> > +
> > +TEST(svint8_t, int8_t, s8)
> > +TEST(svint16_t, int16_t, s16)
> > +TEST(svint32_t, int32_t, s32)
> > +TEST(svint64_t, int64_t, s64)
> > +
> > +TEST(svuint8_t, uint8_t, u8)
> > +TEST(svuint16_t, uint16_t, u16)
> > +TEST(svuint32_t, uint32_t, u32)
> > +TEST(svuint64_t, uint64_t, u64)
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
>
> It would be good to check the float modes too.
>
> Thanks,
> Richard
  
Richard Sandiford May 12, 2022, 10:44 a.m. UTC | #8
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Wed, 11 May 2022 at 12:44, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > index c24c0548724..1ef4ea2087b 100644
>> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > @@ -44,6 +44,14 @@
>> >> >  #include "aarch64-sve-builtins-shapes.h"
>> >> >  #include "aarch64-sve-builtins-base.h"
>> >> >  #include "aarch64-sve-builtins-functions.h"
>> >> > +#include "aarch64-builtins.h"
>> >> > +#include "gimple-ssa.h"
>> >> > +#include "tree-phinodes.h"
>> >> > +#include "tree-ssa-operands.h"
>> >> > +#include "ssa-iterators.h"
>> >> > +#include "stringpool.h"
>> >> > +#include "value-range.h"
>> >> > +#include "tree-ssanames.h"
>> >>
>> >> Minor, but: I think the preferred approach is to include "ssa.h"
>> >> rather than include some of these headers directly.
>> >>
>> >> >
>> >> >  using namespace aarch64_sve;
>> >> >
>> >> > @@ -1207,6 +1215,56 @@ public:
>> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >> >      return e.use_contiguous_load_insn (icode);
>> >> >    }
>> >> > +
>> >> > +  gimple *
>> >> > +  fold (gimple_folder &f) const OVERRIDE
>> >> > +  {
>> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> >> > +
>> >> > +    /* Transform:
>> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> >> > +       into:
>> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
>> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> >> > +       on little endian target.  */
>> >> > +
>> >> > +    if (!BYTES_BIG_ENDIAN
>> >> > +     && integer_all_onesp (arg0))
>> >> > +      {
>> >> > +     tree lhs = gimple_call_lhs (f.call);
>> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>> >>
>> >> Does this work for other element sizes?  I would have expected it
>> >> to be the (128-bit) Advanced SIMD vector associated with the same
>> >> element type as the SVE vector.
>> >>
>> >> The testcase should cover more than just int32x4_t -> svint32_t,
>> >> just to be sure.
>> > In the attached patch, it obtains corresponding advsimd type with:
>> >
>> > tree eltype = TREE_TYPE (lhs_type);
>> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
>> > tree vectype = build_vector_type (eltype, nunits);
>> >
>> > While this seems to work with different element sizes, I am not sure if it's
>> > the correct approach ?
>>
>> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
>> to get the vector mode associated with a .Q “element”, so an
>> alternative would be:
>>
>>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>>
>> which is more explicit about wanting an Advanced SIMD vector.
>>
>> >> > +
>> >> > +     tree elt_ptr_type
>> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> >> > +     tree zero = build_zero_cst (elt_ptr_type);
>> >> > +
>> >> > +     /* Use element type alignment.  */
>> >> > +     tree access_type
>> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
>> >> > +
>> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> >> > +     gimple *mem_ref_stmt
>> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>> >>
>> >> Long line.  Might be easier to format by assigning the fold_build2 result
>> >> to a temporary variable.
>> >>
>> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> >> > +
>> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
>> >> > +     tree lhs_type = TREE_TYPE (lhs);
>> >>
>> >> Is this necessary?  The code above supplied the types and I wouldn't
>> >> have expected them to change during the build process.
>> >>
>> >> > +
>> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
>> >> > +     for (int i = 0; i < source_nelts; i++)
>> >> > +       sel.quick_push (i);
>> >> > +
>> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
>> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>> >>
>> >> Nit: long line.
>> >>
>> >> > +      }
>> >> > +
>> >> > +    return NULL;
>> >> > +  }
>> >> >  };
>> >> >
>> >> >  class svld1ro_impl : public load_replicate
>> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> >> > index f650abbc4ce..47810fec804 100644
>> >> > --- a/gcc/config/aarch64/aarch64.cc
>> >> > +++ b/gcc/config/aarch64/aarch64.cc
>> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >> >    return true;
>> >> >  }
>> >> >
>> >> > +/* Try to implement D using SVE dup instruction.  */
>> >> > +
>> >> > +static bool
>> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> >> > +{
>> >> > +  if (BYTES_BIG_ENDIAN
>> >> > +      || d->perm.length ().is_constant ()
>> >> > +      || !d->one_vector_p
>> >> > +      || d->target == NULL
>> >> > +      || d->op0 == NULL
>> >>
>> >> These last two lines mean that we always return false for d->testing.
>> >> The idea instead is that the return value should be the same for both
>> >> d->testing and !d->testing.  The difference is that for !d->testing we
>> >> also emit code to do the permute.
>>
>> It doesn't look like the new patch addresses this.  There should be
>> no checks for/uses of “d->target” and “d->op0” until after:
>>
>>   if (d->testing_p)
>>     return true;
>>
>> This...
>>
>> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>> >>
>> >> Sorry, I've forgotten the context now, but: these positive tests
>> >> for is_constant surprised me.  Do we really only want to do this
>> >> for variable-length SVE code generation, rather than fixed-length?
>> >>
>> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
>> >> > +    return false;
>> >> > +
>> >> > +  if (d->testing_p)
>> >> > +    return true;
>> >>
>> >> This should happen after the later tests, once we're sure that the
>> >> permute vector has the right form.  If the issue is that op0 isn't
>> >> provided for testing then I think the hook needs to be passed the
>> >> input mode alongside the result mode.
>>
>> ...was my guess about why the checks were there.
> Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
> In that case, how do we obtain input mode ?

Well, like I say, I think we might need to extend the vec_perm_const
hook interface so that it gets passed the input mode, now that that
isn't necessarily the same as the output mode.

It would be good to do that as a separate prepatch, since it would
affect other targets too.  And for safety, that patch should make all
existing implementations of the hook return false if the modes aren't
equal, including for aarch64.  The current patch can then make the
aarch64 hook treat the dup case as an exception.
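
Roughly speaking (just to show the direction — the exact name and position
of the new parameter are a guess, not a final interface):

    /* target.def: pass the input mode to TARGET_VECTORIZE_VEC_PERM_CONST.  */
    bool vec_perm_const (machine_mode mode, machine_mode op_mode,
                         rtx output, rtx in0, rtx in1,
                         const vec_perm_indices &sel);

and, in the prepatch, every existing implementation would start with an
early “return false” whenever the two modes differ, with the aarch64 hook
later relaxing that for the dup case.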

Thanks,
Richard
  
Prathamesh Kulkarni May 31, 2022, 11:32 a.m. UTC | #9
On Thu, 12 May 2022 at 16:15, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Wed, 11 May 2022 at 12:44, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > index c24c0548724..1ef4ea2087b 100644
> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > @@ -44,6 +44,14 @@
> >> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >> >  #include "aarch64-sve-builtins-base.h"
> >> >> >  #include "aarch64-sve-builtins-functions.h"
> >> >> > +#include "aarch64-builtins.h"
> >> >> > +#include "gimple-ssa.h"
> >> >> > +#include "tree-phinodes.h"
> >> >> > +#include "tree-ssa-operands.h"
> >> >> > +#include "ssa-iterators.h"
> >> >> > +#include "stringpool.h"
> >> >> > +#include "value-range.h"
> >> >> > +#include "tree-ssanames.h"
> >> >>
> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> >> rather than include some of these headers directly.
> >> >>
> >> >> >
> >> >> >  using namespace aarch64_sve;
> >> >> >
> >> >> > @@ -1207,6 +1215,56 @@ public:
> >> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >> >      return e.use_contiguous_load_insn (icode);
> >> >> >    }
> >> >> > +
> >> >> > +  gimple *
> >> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> >> > +  {
> >> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
> >> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
> >> >> > +
> >> >> > +    /* Transform:
> >> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> >> >> > +       into:
> >> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> >> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> >> >> > +       on little endian target.  */
> >> >> > +
> >> >> > +    if (!BYTES_BIG_ENDIAN
> >> >> > +     && integer_all_onesp (arg0))
> >> >> > +      {
> >> >> > +     tree lhs = gimple_call_lhs (f.call);
> >> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >> >>
> >> >> Does this work for other element sizes?  I would have expected it
> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> >> element type as the SVE vector.
> >> >>
> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> >> just to be sure.
> >> > In the attached patch, it obtains corresponding advsimd type with:
> >> >
> >> > tree eltype = TREE_TYPE (lhs_type);
> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> >> > tree vectype = build_vector_type (eltype, nunits);
> >> >
> >> > While this seems to work with different element sizes, I am not sure if it's
> >> > the correct approach ?
> >>
> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> >> to get the vector mode associated with a .Q “element”, so an
> >> alternative would be:
> >>
> >>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
> >>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> >>
> >> which is more explicit about wanting an Advanced SIMD vector.
> >>
> >> >> > +
> >> >> > +     tree elt_ptr_type
> >> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> >> >> > +     tree zero = build_zero_cst (elt_ptr_type);
> >> >> > +
> >> >> > +     /* Use element type alignment.  */
> >> >> > +     tree access_type
> >> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> >> >> > +
> >> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> >> > +     gimple *mem_ref_stmt
> >> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
> >> >>
> >> >> Long line.  Might be easier to format by assigning the fold_build2 result
> >> >> to a temporary variable.
> >> >>
> >> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> >> > +
> >> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> >> >> > +     tree lhs_type = TREE_TYPE (lhs);
> >> >>
> >> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> >> have expected them to change during the build process.
> >> >>
> >> >> > +
> >> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> >> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> >> >> > +     for (int i = 0; i < source_nelts; i++)
> >> >> > +       sel.quick_push (i);
> >> >> > +
> >> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
> >> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> >> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> >> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
> >> >>
> >> >> Nit: long line.
> >> >>
> >> >> > +      }
> >> >> > +
> >> >> > +    return NULL;
> >> >> > +  }
> >> >> >  };
> >> >> >
> >> >> >  class svld1ro_impl : public load_replicate
> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> >> > index f650abbc4ce..47810fec804 100644
> >> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >> >> >    return true;
> >> >> >  }
> >> >> >
> >> >> > +/* Try to implement D using SVE dup instruction.  */
> >> >> > +
> >> >> > +static bool
> >> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> >> >> > +{
> >> >> > +  if (BYTES_BIG_ENDIAN
> >> >> > +      || d->perm.length ().is_constant ()
> >> >> > +      || !d->one_vector_p
> >> >> > +      || d->target == NULL
> >> >> > +      || d->op0 == NULL
> >> >>
> >> >> These last two lines mean that we always return false for d->testing.
> >> >> The idea instead is that the return value should be the same for both
> >> >> d->testing and !d->testing.  The difference is that for !d->testing we
> >> >> also emit code to do the permute.
> >>
> >> It doesn't look like the new patch addresses this.  There should be
> >> no checks for/uses of “d->target” and “d->op0” until after:
> >>
> >>   if (d->testing_p)
> >>     return true;
> >>
> >> This...
> >>
> >> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
> >> >>
> >> >> Sorry, I've forgotten the context now, but: these positive tests
> >> >> for is_constant surprised me.  Do we really only want to do this
> >> >> for variable-length SVE code generation, rather than fixed-length?
> >> >>
> >> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> >> >> > +    return false;
> >> >> > +
> >> >> > +  if (d->testing_p)
> >> >> > +    return true;
> >> >>
> >> >> This should happen after the later tests, once we're sure that the
> >> >> permute vector has the right form.  If the issue is that op0 isn't
> >> >> provided for testing then I think the hook needs to be passed the
> >> >> input mode alongside the result mode.
> >>
> >> ...was my guess about why the checks were there.
> > Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
> > In that case, how do we obtain input mode ?
>
> Well, like I say, I think we might need to extend the vec_perm_const
> hook interface so that it gets passed the input mode, now that that
> isn't necessarily the same as the output mode.
>
> It would be good to do that as a separate prepatch, since it would
> affect other targets too.  And for safety, that patch should make all
> existing implementations of the hook return false if the modes aren't
> equal, including for aarch64.  The current patch can then make the
> aarch64 hook treat the dup case as an exception.
Hi Richard,
I have attached an updated patch, which tries to address the above suggestions.
I had questions about a couple of things:
(1) The patch resulted in an ICE for float operands, because we were
using lhs_type to build the mask, which is a float vector type.
So I adjusted the patch to build the mask as a vector of integer_type_node
with length == length (lhs_type) if lhs has a float vector type.
Does that look OK ?
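For example, the ICE showed up for a float variant along the lines of
the following reduced sketch (hypothetical test, same pattern as the
integer case):

#include <arm_neon.h>
#include <arm_sve.h>

/* Hypothetical reduced test for the float case.  */
svfloat32_t
foo_f32 (float32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}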

(2) I moved the check for d->vmode != op_mode (and only checking for dup
in that case) inside aarch64_expand_vec_perm_const_1, since it does some
initial bookkeeping (like swapping operands) before calling the
respective functions.
Does that look OK ?

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index bee410929bd..48e849bec34 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,66 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+	tree eltype = TREE_TYPE (lhs_type);
+
+	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (lhs_len, source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   TYPE_MODE (access_type),
+						   indices));
+	tree mask_type = (FLOAT_TYPE_P (eltype))
+			 ? build_vector_type (integer_type_node, lhs_len)
+			 : lhs_type;
+	tree mask = vec_perm_indices_to_tree (mask_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d4c575ce976..ae8e913d525 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23401,7 +23401,8 @@ struct expand_vec_perm_d
   bool testing_p;
 };
 
-static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
+static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
+					     machine_mode op_mode);
 
 /* Generate a variable permutation.  */
 
@@ -23638,7 +23639,7 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
   newd.one_vector_p = d->one_vector_p;
 
   newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
-  return aarch64_expand_vec_perm_const_1 (&newd);
+  return aarch64_expand_vec_perm_const_1 (&newd, newd.vmode);
 }
 
 /* Recognize patterns suitable for the UZP instructions.  */
@@ -23945,6 +23946,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d, machine_mode op_mode)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || aarch64_classify_vector_mode (op_mode) != VEC_ADVSIMD)
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (op_mode)))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24066,7 +24093,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
 }
 
 static bool
-aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
+				 machine_mode op_mode)
 {
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
@@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
        || d->vec_flags == VEC_SVE_PRED)
       && known_gt (nelt, 1))
     {
+      /* If operand and result modes differ, then only check
+	 for dup case.  */
+      if (d->vmode != op_mode)
+	return (d->vec_flags == VEC_SVE_DATA)
+		? aarch64_evpc_sve_dup (d, op_mode) : false;
+
       if (aarch64_evpc_rev_local (d))
 	return true;
       else if (aarch64_evpc_rev_global (d))
@@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+	{
+	  if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	  else if (aarch64_evpc_sve_dup (d, op_mode))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
@@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  rtx target, rtx op0, rtx op1,
 				  const vec_perm_indices &sel)
 {
-  if (vmode != op_mode)
-    return false;
-
   struct expand_vec_perm_d d;
 
   /* Check whether the mask can be applied to a single vector.  */
@@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   d.testing_p = !target;
 
   if (!d.testing_p)
-    return aarch64_expand_vec_perm_const_1 (&d);
+    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
 
   rtx_insn *last = get_last_insn ();
-  bool ret = aarch64_expand_vec_perm_const_1 (&d);
+  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
   gcc_assert (last == get_last_insn ());
 
   return ret;
  
Richard Sandiford June 1, 2022, 8:42 a.m. UTC | #10
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Thu, 12 May 2022 at 16:15, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Wed, 11 May 2022 at 12:44, Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
>> >> > <richard.sandiford@arm.com> wrote:
>> >> >>
>> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > index c24c0548724..1ef4ea2087b 100644
>> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > @@ -44,6 +44,14 @@
>> >> >> >  #include "aarch64-sve-builtins-shapes.h"
>> >> >> >  #include "aarch64-sve-builtins-base.h"
>> >> >> >  #include "aarch64-sve-builtins-functions.h"
>> >> >> > +#include "aarch64-builtins.h"
>> >> >> > +#include "gimple-ssa.h"
>> >> >> > +#include "tree-phinodes.h"
>> >> >> > +#include "tree-ssa-operands.h"
>> >> >> > +#include "ssa-iterators.h"
>> >> >> > +#include "stringpool.h"
>> >> >> > +#include "value-range.h"
>> >> >> > +#include "tree-ssanames.h"
>> >> >>
>> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
>> >> >> rather than include some of these headers directly.
>> >> >>
>> >> >> >
>> >> >> >  using namespace aarch64_sve;
>> >> >> >
>> >> >> > @@ -1207,6 +1215,56 @@ public:
>> >> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >> >> >      return e.use_contiguous_load_insn (icode);
>> >> >> >    }
>> >> >> > +
>> >> >> > +  gimple *
>> >> >> > +  fold (gimple_folder &f) const OVERRIDE
>> >> >> > +  {
>> >> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> >> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> >> >> > +
>> >> >> > +    /* Transform:
>> >> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> >> >> > +       into:
>> >> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
>> >> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> >> >> > +       on little endian target.  */
>> >> >> > +
>> >> >> > +    if (!BYTES_BIG_ENDIAN
>> >> >> > +     && integer_all_onesp (arg0))
>> >> >> > +      {
>> >> >> > +     tree lhs = gimple_call_lhs (f.call);
>> >> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>> >> >>
>> >> >> Does this work for other element sizes?  I would have expected it
>> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
>> >> >> element type as the SVE vector.
>> >> >>
>> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
>> >> >> just to be sure.
>> >> > In the attached patch, it obtains corresponding advsimd type with:
>> >> >
>> >> > tree eltype = TREE_TYPE (lhs_type);
>> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
>> >> > tree vectype = build_vector_type (eltype, nunits);
>> >> >
>> >> > While this seems to work with different element sizes, I am not sure if it's
>> >> > the correct approach ?
>> >>
>> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
>> >> to get the vector mode associated with a .Q “element”, so an
>> >> alternative would be:
>> >>
>> >>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>> >>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>> >>
>> >> which is more explicit about wanting an Advanced SIMD vector.
>> >>
>> >> >> > +
>> >> >> > +     tree elt_ptr_type
>> >> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> >> >> > +     tree zero = build_zero_cst (elt_ptr_type);
>> >> >> > +
>> >> >> > +     /* Use element type alignment.  */
>> >> >> > +     tree access_type
>> >> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
>> >> >> > +
>> >> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> >> >> > +     gimple *mem_ref_stmt
>> >> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>> >> >>
>> >> >> Long line.  Might be easier to format by assigning the fold_build2 result
>> >> >> to a temporary variable.
>> >> >>
>> >> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> >> >> > +
>> >> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> >> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
>> >> >> > +     tree lhs_type = TREE_TYPE (lhs);
>> >> >>
>> >> >> Is this necessary?  The code above supplied the types and I wouldn't
>> >> >> have expected them to change during the build process.
>> >> >>
>> >> >> > +
>> >> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> >> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
>> >> >> > +     for (int i = 0; i < source_nelts; i++)
>> >> >> > +       sel.quick_push (i);
>> >> >> > +
>> >> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> >> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
>> >> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> >> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>> >> >>
>> >> >> Nit: long line.
>> >> >>
>> >> >> > +      }
>> >> >> > +
>> >> >> > +    return NULL;
>> >> >> > +  }
>> >> >> >  };
>> >> >> >
>> >> >> >  class svld1ro_impl : public load_replicate
>> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> >> >> > index f650abbc4ce..47810fec804 100644
>> >> >> > --- a/gcc/config/aarch64/aarch64.cc
>> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
>> >> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >> >> >    return true;
>> >> >> >  }
>> >> >> >
>> >> >> > +/* Try to implement D using SVE dup instruction.  */
>> >> >> > +
>> >> >> > +static bool
>> >> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> >> >> > +{
>> >> >> > +  if (BYTES_BIG_ENDIAN
>> >> >> > +      || d->perm.length ().is_constant ()
>> >> >> > +      || !d->one_vector_p
>> >> >> > +      || d->target == NULL
>> >> >> > +      || d->op0 == NULL
>> >> >>
>> >> >> These last two lines mean that we always return false for d->testing.
>> >> >> The idea instead is that the return value should be the same for both
>> >> >> d->testing and !d->testing.  The difference is that for !d->testing we
>> >> >> also emit code to do the permute.
>> >>
>> >> It doesn't look like the new patch addresses this.  There should be
>> >> no checks for/uses of “d->target” and “d->op0” until after:
>> >>
>> >>   if (d->testing_p)
>> >>     return true;
>> >>
>> >> This...
>> >>
>> >> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>> >> >>
>> >> >> Sorry, I've forgotten the context now, but: these positive tests
>> >> >> for is_constant surprised me.  Do we really only want to do this
>> >> >> for variable-length SVE code generation, rather than fixed-length?
>> >> >>
>> >> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
>> >> >> > +    return false;
>> >> >> > +
>> >> >> > +  if (d->testing_p)
>> >> >> > +    return true;
>> >> >>
>> >> >> This should happen after the later tests, once we're sure that the
>> >> >> permute vector has the right form.  If the issue is that op0 isn't
>> >> >> provided for testing then I think the hook needs to be passed the
>> >> >> input mode alongside the result mode.
>> >>
>> >> ...was my guess about why the checks were there.
>> > Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
>> > In that case, how do we obtain input mode ?
>>
>> Well, like I say, I think we might need to extend the vec_perm_const
>> hook interface so that it gets passed the input mode, now that that
>> isn't necessarily the same as the output mode.
>>
>> It would be good to do that as a separate prepatch, since it would
>> affect other targets too.  And for safety, that patch should make all
>> existing implementations of the hook return false if the modes aren't
>> equal, including for aarch64.  The current patch can then make the
>> aarch64 hook treat the dup case as an exception.
> Hi Richard,
> I have attached updated patch, which tries to address above suggestions.
> I had a question about couple of things:
> (1) The patch resulted in ICE for float operands, because we were
> using lhs_type to build mask, which is float vector type.
> So I adjusted the patch to make mask vector of integer_type_node with
> length == length(lhs_type) if lhs has float vector type.
> Does that look OK ?

Let's use:

  build_vector_type (ssizetype, lhs_len)

unconditionally, even for integers.
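That is, the fold would build the mask with something like:

  tree mask_type = build_vector_type (ssizetype, lhs_len);
  tree mask = vec_perm_indices_to_tree (mask_type, indices);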

> (2) Moved check for d->vmode != op_mode (and only checking for dup in
> that case), inside vec_perm_const_1,
> since it does some initial bookkeeping (like swapping operands),
> before calling respective functions.
> Does that look OK ?
>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Richard
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index bee410929bd..48e849bec34 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1208,66 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	tree lhs_type = TREE_TYPE (lhs);
> +	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +	tree eltype = TREE_TYPE (lhs_type);
> +
> +	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> +	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> +	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +	vec_perm_builder sel (lhs_len, source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +						   TYPE_MODE (access_type),
> +						   indices));
> +	tree mask_type = (FLOAT_TYPE_P (eltype))
> +			 ? build_vector_type (integer_type_node, lhs_len)
> +			 : lhs_type;
> +	tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +				    mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d4c575ce976..ae8e913d525 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23401,7 +23401,8 @@ struct expand_vec_perm_d
>    bool testing_p;
>  };
>  
> -static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
> +static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
> +					     machine_mode op_mode);
>  
>  /* Generate a variable permutation.  */
>  
> @@ -23638,7 +23639,7 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
>    newd.one_vector_p = d->one_vector_p;
>  
>    newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
> -  return aarch64_expand_vec_perm_const_1 (&newd);
> +  return aarch64_expand_vec_perm_const_1 (&newd, newd.vmode);
>  }
>  
>  /* Recognize patterns suitable for the UZP instructions.  */
> @@ -23945,6 +23946,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d, machine_mode op_mode)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || d->perm.length ().is_constant ()

Sorry, I've forgotten: why do we need this is_constant check?

> +      || !d->one_vector_p
> +      || aarch64_classify_vector_mode (op_mode) != VEC_ADVSIMD)
> +    return false;

We need to check that nelts_per_pattern is 1 as well.
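So (assuming the is_constant check above does turn out to be
unnecessary) the guard would look something like:

  if (BYTES_BIG_ENDIAN
      || !d->one_vector_p
      || d->perm.encoding ().nelts_per_pattern () != 1
      || aarch64_classify_vector_mode (op_mode) != VEC_ADVSIMD)
    return false;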

> +  int npatterns = d->perm.encoding ().npatterns ();
> +  if (!known_eq (npatterns, GET_MODE_NUNITS (op_mode)))
> +    return false;
> +
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24066,7 +24093,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
>  }
>  
>  static bool
> -aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> +aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
> +				 machine_mode op_mode)

I think we should add op_mode to expand_vec_perm_d instead.
Let's also add an op_vec_flags to cache the aarch64_classify_vector_mode
result.
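Roughly:

  struct expand_vec_perm_d
  {
    rtx target, op0, op1;
    vec_perm_indices perm;
    machine_mode vmode;
    machine_mode op_mode;	 /* new: the mode of op0/op1 */
    unsigned int vec_flags;
    unsigned int op_vec_flags;	 /* new: aarch64_classify_vector_mode (op_mode) */
    bool one_vector_p;
    bool testing_p;
  };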

>  {
>    /* The pattern matching functions above are written to look for a small
>       number to begin the sequence (0, 1, N/2).  If we begin with an index
> @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>         || d->vec_flags == VEC_SVE_PRED)
>        && known_gt (nelt, 1))
>      {
> +      /* If operand and result modes differ, then only check
> +	 for dup case.  */
> +      if (d->vmode != op_mode)
> +	return (d->vec_flags == VEC_SVE_DATA)
> +		? aarch64_evpc_sve_dup (d, op_mode) : false;
> +

I think it'd be more future-proof to format this as:

    if (d->vmode == d->op_mode)
      {
        …existing code…
      }
    else
      {
        if (aarch64_evpc_sve_dup (d))
          return true;
      }

with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
alongside the op_mode check.  I think we'll be adding more checks here
over time.

>        if (aarch64_evpc_rev_local (d))
>  	return true;
>        else if (aarch64_evpc_rev_global (d))
> @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        else if (aarch64_evpc_reencode (d))
>  	return true;
>        if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> +	{
> +	  if (aarch64_evpc_sve_tbl (d))
> +	    return true;
> +	  else if (aarch64_evpc_sve_dup (d, op_mode))
> +	    return true;
> +	}
>        else if (d->vec_flags == VEC_ADVSIMD)
>  	return aarch64_evpc_tbl (d);
>      }

Is this part still needed, given the above?

Thanks,
Richard

> @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>  				  rtx target, rtx op0, rtx op1,
>  				  const vec_perm_indices &sel)
>  {
> -  if (vmode != op_mode)
> -    return false;
> -
>    struct expand_vec_perm_d d;
>  
>    /* Check whether the mask can be applied to a single vector.  */
> @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>    d.testing_p = !target;
>  
>    if (!d.testing_p)
> -    return aarch64_expand_vec_perm_const_1 (&d);
> +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>  
>    rtx_insn *last = get_last_insn ();
> -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
> +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>    gcc_assert (last == get_last_insn ());
>  
>    return ret;
  
Prathamesh Kulkarni June 5, 2022, 10:15 a.m. UTC | #11
On Wed, 1 Jun 2022 at 14:12, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Thu, 12 May 2022 at 16:15, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > On Wed, 11 May 2022 at 12:44, Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
> >> >> > <richard.sandiford@arm.com> wrote:
> >> >> >>
> >> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > index c24c0548724..1ef4ea2087b 100644
> >> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > @@ -44,6 +44,14 @@
> >> >> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >> >> >  #include "aarch64-sve-builtins-base.h"
> >> >> >> >  #include "aarch64-sve-builtins-functions.h"
> >> >> >> > +#include "aarch64-builtins.h"
> >> >> >> > +#include "gimple-ssa.h"
> >> >> >> > +#include "tree-phinodes.h"
> >> >> >> > +#include "tree-ssa-operands.h"
> >> >> >> > +#include "ssa-iterators.h"
> >> >> >> > +#include "stringpool.h"
> >> >> >> > +#include "value-range.h"
> >> >> >> > +#include "tree-ssanames.h"
> >> >> >>
> >> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> >> >> rather than include some of these headers directly.
> >> >> >>
> >> >> >> >
> >> >> >> >  using namespace aarch64_sve;
> >> >> >> >
> >> >> >> > @@ -1207,6 +1215,56 @@ public:
> >> >> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >> >> >      return e.use_contiguous_load_insn (icode);
> >> >> >> >    }
> >> >> >> > +
> >> >> >> > +  gimple *
> >> >> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> >> >> > +  {
> >> >> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
> >> >> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
> >> >> >> > +
> >> >> >> > +    /* Transform:
> >> >> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> >> >> >> > +       into:
> >> >> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> >> >> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> >> >> >> > +       on little endian target.  */
> >> >> >> > +
> >> >> >> > +    if (!BYTES_BIG_ENDIAN
> >> >> >> > +     && integer_all_onesp (arg0))
> >> >> >> > +      {
> >> >> >> > +     tree lhs = gimple_call_lhs (f.call);
> >> >> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >> >> >>
> >> >> >> Does this work for other element sizes?  I would have expected it
> >> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> >> >> element type as the SVE vector.
> >> >> >>
> >> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> >> >> just to be sure.
> >> >> > In the attached patch, it obtains corresponding advsimd type with:
> >> >> >
> >> >> > tree eltype = TREE_TYPE (lhs_type);
> >> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> >> >> > tree vectype = build_vector_type (eltype, nunits);
> >> >> >
> >> >> > While this seems to work with different element sizes, I am not sure if it's
> >> >> > the correct approach ?
> >> >>
> >> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> >> >> to get the vector mode associated with a .Q “element”, so an
> >> >> alternative would be:
> >> >>
> >> >>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
> >> >>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> >> >>
> >> >> which is more explicit about wanting an Advanced SIMD vector.
> >> >>
> >> >> >> > +
> >> >> >> > +     tree elt_ptr_type
> >> >> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> >> >> >> > +     tree zero = build_zero_cst (elt_ptr_type);
> >> >> >> > +
> >> >> >> > +     /* Use element type alignment.  */
> >> >> >> > +     tree access_type
> >> >> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> >> >> >> > +
> >> >> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> >> >> > +     gimple *mem_ref_stmt
> >> >> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
> >> >> >>
> >> >> >> Long line.  Might be easier to format by assigning the fold_build2 result
> >> >> >> to a temporary variable.
> >> >> >>
> >> >> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> >> >> > +
> >> >> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> >> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> >> >> >> > +     tree lhs_type = TREE_TYPE (lhs);
> >> >> >>
> >> >> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> >> >> have expected them to change during the build process.
> >> >> >>
> >> >> >> > +
> >> >> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> >> >> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> >> >> >> > +     for (int i = 0; i < source_nelts; i++)
> >> >> >> > +       sel.quick_push (i);
> >> >> >> > +
> >> >> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
> >> >> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> >> >> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> >> >> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
> >> >> >>
> >> >> >> Nit: long line.
> >> >> >>
> >> >> >> > +      }
> >> >> >> > +
> >> >> >> > +    return NULL;
> >> >> >> > +  }
> >> >> >> >  };
> >> >> >> >
> >> >> >> >  class svld1ro_impl : public load_replicate
> >> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> >> >> > index f650abbc4ce..47810fec804 100644
> >> >> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> >> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >> >> >> >    return true;
> >> >> >> >  }
> >> >> >> >
> >> >> >> > +/* Try to implement D using SVE dup instruction.  */
> >> >> >> > +
> >> >> >> > +static bool
> >> >> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> >> >> >> > +{
> >> >> >> > +  if (BYTES_BIG_ENDIAN
> >> >> >> > +      || d->perm.length ().is_constant ()
> >> >> >> > +      || !d->one_vector_p
> >> >> >> > +      || d->target == NULL
> >> >> >> > +      || d->op0 == NULL
> >> >> >>
> >> >> >> These last two lines mean that we always return false for d->testing.
> >> >> >> The idea instead is that the return value should be the same for both
> >> >> >> d->testing and !d->testing.  The difference is that for !d->testing we
> >> >> >> also emit code to do the permute.
> >> >>
> >> >> It doesn't look like the new patch addresses this.  There should be
> >> >> no checks for/uses of “d->target” and “d->op0” until after:
> >> >>
> >> >>   if (d->testing_p)
> >> >>     return true;
> >> >>
> >> >> This...
> >> >>
> >> >> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
> >> >> >>
> >> >> >> Sorry, I've forgotten the context now, but: these positive tests
> >> >> >> for is_constant surprised me.  Do we really only want to do this
> >> >> >> for variable-length SVE code generation, rather than fixed-length?
> >> >> >>
> >> >> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> >> >> >> > +    return false;
> >> >> >> > +
> >> >> >> > +  if (d->testing_p)
> >> >> >> > +    return true;
> >> >> >>
> >> >> >> This should happen after the later tests, once we're sure that the
> >> >> >> permute vector has the right form.  If the issue is that op0 isn't
> >> >> >> provided for testing then I think the hook needs to be passed the
> >> >> >> input mode alongside the result mode.
> >> >>
> >> >> ...was my guess about why the checks were there.
> >> > Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
> >> > In that case, how do we obtain input mode ?
> >>
> >> Well, like I say, I think we might need to extend the vec_perm_const
> >> hook interface so that it gets passed the input mode, now that that
> >> isn't necessarily the same as the output mode.
> >>
> >> It would be good to do that as a separate prepatch, since it would
> >> affect other targets too.  And for safety, that patch should make all
> >> existing implementations of the hook return false if the modes aren't
> >> equal, including for aarch64.  The current patch can then make the
> >> aarch64 hook treat the dup case as an exception.
> > Hi Richard,
> > I have attached updated patch, which tries to address above suggestions.
> > I had a question about couple of things:
> > (1) The patch resulted in ICE for float operands, because we were
> > using lhs_type to build mask, which is float vector type.
> > So I adjusted the patch to make mask vector of integer_type_node with
> > length == length(lhs_type) if lhs has float vector type.
> > Does that look OK ?
>
> Let's use:
>
>   build_vector_type (ssizetype, lhs_len)
>
> unconditionally, even for integers.
OK thanks, done in the attached patch.
>
> > (2) Moved check for d->vmode != op_mode (and only checking for dup in
> > that case), inside vec_perm_const_1,
> > since it does some initial bookkeeping (like swapping operands),
> > before calling respective functions.
> > Does that look OK ?
> >
> > Thanks,
> > Prathamesh
> >>
> >> Thanks,
> >> Richard
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index bee410929bd..48e849bec34 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,7 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "ssa.h"
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1208,66 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const override
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.
> > +       vectype is the corresponding ADVSIMD type.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     tree lhs_type = TREE_TYPE (lhs);
> > +     poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > +     tree eltype = TREE_TYPE (lhs_type);
> > +
> > +     scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> > +     machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> > +     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> > +
> > +     tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> > +     tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> > +     vec_perm_builder sel (lhs_len, source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > +                                                TYPE_MODE (access_type),
> > +                                                indices));
> > +     tree mask_type = (FLOAT_TYPE_P (eltype))
> > +                      ? build_vector_type (integer_type_node, lhs_len)
> > +                      : lhs_type;
> > +     tree mask = vec_perm_indices_to_tree (mask_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR,
> > +                                 mem_ref_lhs, mem_ref_lhs, mask);
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d4c575ce976..ae8e913d525 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23401,7 +23401,8 @@ struct expand_vec_perm_d
> >    bool testing_p;
> >  };
> >
> > -static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
> > +static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
> > +                                          machine_mode op_mode);
> >
> >  /* Generate a variable permutation.  */
> >
> > @@ -23638,7 +23639,7 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
> >    newd.one_vector_p = d->one_vector_p;
> >
> >    newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
> > -  return aarch64_expand_vec_perm_const_1 (&newd);
> > +  return aarch64_expand_vec_perm_const_1 (&newd, newd.vmode);
> >  }
> >
> >  /* Recognize patterns suitable for the UZP instructions.  */
> > @@ -23945,6 +23946,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d, machine_mode op_mode)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || d->perm.length ().is_constant ()
>
> Sorry, I've forgotten: why do we need this is_constant check?
Oh, I guess I had put it there to check whether the target vector has
variable length, sorry.
I assume we don't need this; removed in the attached patch.
>
> > +      || !d->one_vector_p
> > +      || aarch64_classify_vector_mode (op_mode) != VEC_ADVSIMD)
> > +    return false;
>
> We need to check that nelts_per_pattern is 1 as well.
OK thanks, done.
>
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (op_mode)))
> > +    return false;
> > +
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24066,7 +24093,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
> >  }
> >
> >  static bool
> > -aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > +aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d,
> > +                              machine_mode op_mode)
>
> I think we should add op_mode to expand_vec_perm_d instead.
> Let's also add an op_vec_flags to cache the aarch64_classify_vector_mode
> result.
OK thanks, done.
>
> >  {
> >    /* The pattern matching functions above are written to look for a small
> >       number to begin the sequence (0, 1, N/2).  If we begin with an index
> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >         || d->vec_flags == VEC_SVE_PRED)
> >        && known_gt (nelt, 1))
> >      {
> > +      /* If operand and result modes differ, then only check
> > +      for dup case.  */
> > +      if (d->vmode != op_mode)
> > +     return (d->vec_flags == VEC_SVE_DATA)
> > +             ? aarch64_evpc_sve_dup (d, op_mode) : false;
> > +
>
> I think it'd be more future-proof to format this as:
>
>     if (d->vmod == d->op_mode)
>       {
>         …existing code…
>       }
>     else
>       {
>         if (aarch64_evpc_sve_dup (d))
>           return true;
>       }
>
> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
> alongside the op_mode check.  I think we'll be adding more checks here
> over time.
Um I was wondering if we should structure it as:
if (d->vmode == d->op_mode)
  {
     ...existing code...
  }
if (aarch64_evpc_sve_dup (d))
  return true;

So we would check for dup irrespective of whether d->vmode == d->op_mode ?

Thanks,
Prathamesh
>
> >        if (aarch64_evpc_rev_local (d))
> >       return true;
> >        else if (aarch64_evpc_rev_global (d))
> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >        else if (aarch64_evpc_reencode (d))
> >       return true;
> >        if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > +     {
> > +       if (aarch64_evpc_sve_tbl (d))
> > +         return true;
> > +       else if (aarch64_evpc_sve_dup (d, op_mode))
> > +         return true;
> > +     }
> >        else if (d->vec_flags == VEC_ADVSIMD)
> >       return aarch64_evpc_tbl (d);
> >      }
>
> Is this part still needed, given the above?
>
> Thanks,
> Richard
>
> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >                                 rtx target, rtx op0, rtx op1,
> >                                 const vec_perm_indices &sel)
> >  {
> > -  if (vmode != op_mode)
> > -    return false;
> > -
> >    struct expand_vec_perm_d d;
> >
> >    /* Check whether the mask can be applied to a single vector.  */
> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >    d.testing_p = !target;
> >
> >    if (!d.testing_p)
> > -    return aarch64_expand_vec_perm_const_1 (&d);
> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >
> >    rtx_insn *last = get_last_insn ();
> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >    gcc_assert (last == get_last_insn ());
> >
> >    return ret;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index bee410929bd..1a804b1ab73 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,64 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+	tree eltype = TREE_TYPE (lhs_type);
+
+	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (lhs_len, source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   TYPE_MODE (access_type),
+						   indices));
+	tree mask_type = build_vector_type (ssizetype, lhs_len);
+	tree mask = vec_perm_indices_to_tree (mask_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d4c575ce976..bb24701b0d2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
 {
   rtx target, op0, op1;
   vec_perm_indices perm;
+  machine_mode op_mode;
   machine_mode vmode;
   unsigned int vec_flags;
+  unsigned int op_vec_flags;
   bool one_vector_p;
   bool testing_p;
 };
@@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || !d->one_vector_p
+      || d->vec_flags != VEC_SVE_DATA
+      || d->op_vec_flags != VEC_ADVSIMD
+      || d->perm.encoding ().nelts_per_pattern () != 1
+      || !known_eq (d->perm.encoding ().npatterns (),
+		    GET_MODE_NUNITS (d->op_mode)))
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24084,30 +24112,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
        || d->vec_flags == VEC_SVE_PRED)
       && known_gt (nelt, 1))
     {
-      if (aarch64_evpc_rev_local (d))
-	return true;
-      else if (aarch64_evpc_rev_global (d))
-	return true;
-      else if (aarch64_evpc_ext (d))
-	return true;
-      else if (aarch64_evpc_dup (d))
-	return true;
-      else if (aarch64_evpc_zip (d))
-	return true;
-      else if (aarch64_evpc_uzp (d))
-	return true;
-      else if (aarch64_evpc_trn (d))
-	return true;
-      else if (aarch64_evpc_sel (d))
-	return true;
-      else if (aarch64_evpc_ins (d))
-	return true;
-      else if (aarch64_evpc_reencode (d))
+      /* If operand and result modes differ, then only check
+	 for dup case.  */
+      if (d->vmode == d->op_mode)
+	{
+	  if (aarch64_evpc_rev_local (d))
+	    return true;
+	  else if (aarch64_evpc_rev_global (d))
+	    return true;
+	  else if (aarch64_evpc_ext (d))
+	    return true;
+	  else if (aarch64_evpc_dup (d))
+	    return true;
+	  else if (aarch64_evpc_zip (d))
+	    return true;
+	  else if (aarch64_evpc_uzp (d))
+	    return true;
+	  else if (aarch64_evpc_trn (d))
+	    return true;
+	  else if (aarch64_evpc_sel (d))
+	    return true;
+	  else if (aarch64_evpc_ins (d))
+	    return true;
+	  else if (aarch64_evpc_reencode (d))
+	    return true;
+
+	  if (d->vec_flags == VEC_SVE_DATA)
+	    return aarch64_evpc_sve_tbl (d);
+	  else if (d->vec_flags == VEC_ADVSIMD)
+	    return aarch64_evpc_tbl (d);
+	}
+
+      if (aarch64_evpc_sve_dup (d))
 	return true;
-      if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
-      else if (d->vec_flags == VEC_ADVSIMD)
-	return aarch64_evpc_tbl (d);
     }
   return false;
 }
@@ -24119,9 +24156,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  rtx target, rtx op0, rtx op1,
 				  const vec_perm_indices &sel)
 {
-  if (vmode != op_mode)
-    return false;
-
   struct expand_vec_perm_d d;
 
   /* Check whether the mask can be applied to a single vector.  */
@@ -24145,6 +24179,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 		     sel.nelts_per_input ());
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+  d.op_mode = op_mode;
+  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
   d.target = target;
   d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
   if (op0 == op1)
  
Richard Sandiford June 6, 2022, 10:59 a.m. UTC | #12
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >  {
>> >    /* The pattern matching functions above are written to look for a small
>> >       number to begin the sequence (0, 1, N/2).  If we begin with an index
>> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >         || d->vec_flags == VEC_SVE_PRED)
>> >        && known_gt (nelt, 1))
>> >      {
>> > +      /* If operand and result modes differ, then only check
>> > +      for dup case.  */
>> > +      if (d->vmode != op_mode)
>> > +     return (d->vec_flags == VEC_SVE_DATA)
>> > +             ? aarch64_evpc_sve_dup (d, op_mode) : false;
>> > +
>>
>> I think it'd be more future-proof to format this as:
>>
>>     if (d->vmod == d->op_mode)
>>       {
>>         …existing code…
>>       }
>>     else
>>       {
>>         if (aarch64_evpc_sve_dup (d))
>>           return true;
>>       }
>>
>> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
>> alongside the op_mode check.  I think we'll be adding more checks here
>> over time.
> Um I was wondering if we should structure it as:
> if (d->vmode == d->op_mode)
>   {
>      ...existing code...
>   }
> if (aarch64_evpc_sve_dup (d))
>   return true;
>
> So we check for dup irrespective of  d->vmode == d->op_mode ?

Yeah, I can see the attraction of that.  I think the else is better
though because the fallback TBL handling will (rightly) come at the end
of the existing code.  Without the else, we'd have specific tests like
DUP after generic ones like TBL, so the reader would have to work out
for themselves that DUP and TBL handle disjoint cases.
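I.e. roughly:

  if (d->vmode == d->op_mode)
    {
      /* ...existing specific tests, ending with the TBL fallbacks...  */
      if (d->vec_flags == VEC_SVE_DATA)
	return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
	return aarch64_evpc_tbl (d);
    }
  else
    {
      if (aarch64_evpc_sve_dup (d))
	return true;
    }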

>> >        if (aarch64_evpc_rev_local (d))
>> >       return true;
>> >        else if (aarch64_evpc_rev_global (d))
>> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >        else if (aarch64_evpc_reencode (d))
>> >       return true;
>> >        if (d->vec_flags == VEC_SVE_DATA)
>> > -     return aarch64_evpc_sve_tbl (d);
>> > +     {
>> > +       if (aarch64_evpc_sve_tbl (d))
>> > +         return true;
>> > +       else if (aarch64_evpc_sve_dup (d, op_mode))
>> > +         return true;
>> > +     }
>> >        else if (d->vec_flags == VEC_ADVSIMD)
>> >       return aarch64_evpc_tbl (d);
>> >      }
>>
>> Is this part still needed, given the above?
>>
>> Thanks,
>> Richard
>>
>> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >                                 rtx target, rtx op0, rtx op1,
>> >                                 const vec_perm_indices &sel)
>> >  {
>> > -  if (vmode != op_mode)
>> > -    return false;
>> > -
>> >    struct expand_vec_perm_d d;
>> >
>> >    /* Check whether the mask can be applied to a single vector.  */
>> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >    d.testing_p = !target;
>> >
>> >    if (!d.testing_p)
>> > -    return aarch64_expand_vec_perm_const_1 (&d);
>> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >
>> >    rtx_insn *last = get_last_insn ();
>> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
>> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >    gcc_assert (last == get_last_insn ());
>> >
>> >    return ret;
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index bee410929bd..1a804b1ab73 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1208,64 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	tree lhs_type = TREE_TYPE (lhs);
> +	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +	tree eltype = TREE_TYPE (lhs_type);
> +
> +	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> +	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> +	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +	vec_perm_builder sel (lhs_len, source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +						   TYPE_MODE (access_type),
> +						   indices));
> +	tree mask_type = build_vector_type (ssizetype, lhs_len);
> +	tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +				    mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d4c575ce976..bb24701b0d2 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
>  {
>    rtx target, op0, op1;
>    vec_perm_indices perm;
> +  machine_mode op_mode;
>    machine_mode vmode;
>    unsigned int vec_flags;
> +  unsigned int op_vec_flags;

Very minor, but it would be good to keep the order consistent:
output mode first or input mode first.  Guess it might as well
be output mode first, to match the hook:

  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;

>    bool one_vector_p;
>    bool testing_p;
>  };
> @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || !d->one_vector_p
> +      || d->vec_flags != VEC_SVE_DATA
> +      || d->op_vec_flags != VEC_ADVSIMD

Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need:

      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)

This isn't redundant with any of the other tests.

(We can use DUP .D for 64-bit input vectors, but that's a separate patch.)

OK with those changes (including using "else" :-)), thanks.

Richard

> +      || d->perm.encoding ().nelts_per_pattern () != 1
> +      || !known_eq (d->perm.encoding ().npatterns (),
> +		    GET_MODE_NUNITS (d->op_mode)))
> +    return false;
> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24084,30 +24112,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>         || d->vec_flags == VEC_SVE_PRED)
>        && known_gt (nelt, 1))
>      {
> -      if (aarch64_evpc_rev_local (d))
> -	return true;
> -      else if (aarch64_evpc_rev_global (d))
> -	return true;
> -      else if (aarch64_evpc_ext (d))
> -	return true;
> -      else if (aarch64_evpc_dup (d))
> -	return true;
> -      else if (aarch64_evpc_zip (d))
> -	return true;
> -      else if (aarch64_evpc_uzp (d))
> -	return true;
> -      else if (aarch64_evpc_trn (d))
> -	return true;
> -      else if (aarch64_evpc_sel (d))
> -	return true;
> -      else if (aarch64_evpc_ins (d))
> -	return true;
> -      else if (aarch64_evpc_reencode (d))
> +      /* If operand and result modes differ, then only check
> +	 for dup case.  */
> +      if (d->vmode == d->op_mode)
> +	{
> +	  if (aarch64_evpc_rev_local (d))
> +	    return true;
> +	  else if (aarch64_evpc_rev_global (d))
> +	    return true;
> +	  else if (aarch64_evpc_ext (d))
> +	    return true;
> +	  else if (aarch64_evpc_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_zip (d))
> +	    return true;
> +	  else if (aarch64_evpc_uzp (d))
> +	    return true;
> +	  else if (aarch64_evpc_trn (d))
> +	    return true;
> +	  else if (aarch64_evpc_sel (d))
> +	    return true;
> +	  else if (aarch64_evpc_ins (d))
> +	    return true;
> +	  else if (aarch64_evpc_reencode (d))
> +	    return true;
> +
> +	  if (d->vec_flags == VEC_SVE_DATA)
> +	    return aarch64_evpc_sve_tbl (d);
> +	  else if (d->vec_flags == VEC_ADVSIMD)
> +	    return aarch64_evpc_tbl (d);
> +	}
> +
> +      if (aarch64_evpc_sve_dup (d))
>  	return true;
> -      if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> -      else if (d->vec_flags == VEC_ADVSIMD)
> -	return aarch64_evpc_tbl (d);
>      }
>    return false;
>  }
> @@ -24119,9 +24156,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>  				  rtx target, rtx op0, rtx op1,
>  				  const vec_perm_indices &sel)
>  {
> -  if (vmode != op_mode)
> -    return false;
> -
>    struct expand_vec_perm_d d;
>  
>    /* Check whether the mask can be applied to a single vector.  */
> @@ -24145,6 +24179,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>  		     sel.nelts_per_input ());
>    d.vmode = vmode;
>    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
> +  d.op_mode = op_mode;
> +  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
>    d.target = target;
>    d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
>    if (op0 == op1)
  
Prathamesh Kulkarni June 7, 2022, 10:47 a.m. UTC | #13
On Mon, 6 Jun 2022 at 16:29, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >  {
> >> >    /* The pattern matching functions above are written to look for a small
> >> >       number to begin the sequence (0, 1, N/2).  If we begin with an index
> >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >> >         || d->vec_flags == VEC_SVE_PRED)
> >> >        && known_gt (nelt, 1))
> >> >      {
> >> > +      /* If operand and result modes differ, then only check
> >> > +      for dup case.  */
> >> > +      if (d->vmode != op_mode)
> >> > +     return (d->vec_flags == VEC_SVE_DATA)
> >> > +             ? aarch64_evpc_sve_dup (d, op_mode) : false;
> >> > +
> >>
> >> I think it'd be more future-proof to format this as:
> >>
> >>     if (d->vmod == d->op_mode)
> >>       {
> >>         …existing code…
> >>       }
> >>     else
> >>       {
> >>         if (aarch64_evpc_sve_dup (d))
> >>           return true;
> >>       }
> >>
> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
> >> alongside the op_mode check.  I think we'll be adding more checks here
> >> over time.
> > Um I was wondering if we should structure it as:
> > if (d->vmode == d->op_mode)
> >   {
> >      ...existing code...
> >   }
> > if (aarch64_evpc_sve_dup (d))
> >   return true;
> >
> > So we check for dup irrespective of  d->vmode == d->op_mode ?
>
> Yeah, I can see the attraction of that.  I think the else is better
> though because the fallback TBL handling will (rightly) come at the end
> of the existing code.  Without the else, we'd have specific tests like
> DUP after generic ones like TBL, so the reader would have to work out
> for themselves that DUP and TBL handle disjoint cases.
>
> >> >        if (aarch64_evpc_rev_local (d))
> >> >       return true;
> >> >        else if (aarch64_evpc_rev_global (d))
> >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >> >        else if (aarch64_evpc_reencode (d))
> >> >       return true;
> >> >        if (d->vec_flags == VEC_SVE_DATA)
> >> > -     return aarch64_evpc_sve_tbl (d);
> >> > +     {
> >> > +       if (aarch64_evpc_sve_tbl (d))
> >> > +         return true;
> >> > +       else if (aarch64_evpc_sve_dup (d, op_mode))
> >> > +         return true;
> >> > +     }
> >> >        else if (d->vec_flags == VEC_ADVSIMD)
> >> >       return aarch64_evpc_tbl (d);
> >> >      }
> >>
> >> Is this part still needed, given the above?
> >>
> >> Thanks,
> >> Richard
> >>
> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >> >                                 rtx target, rtx op0, rtx op1,
> >> >                                 const vec_perm_indices &sel)
> >> >  {
> >> > -  if (vmode != op_mode)
> >> > -    return false;
> >> > -
> >> >    struct expand_vec_perm_d d;
> >> >
> >> >    /* Check whether the mask can be applied to a single vector.  */
> >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >> >    d.testing_p = !target;
> >> >
> >> >    if (!d.testing_p)
> >> > -    return aarch64_expand_vec_perm_const_1 (&d);
> >> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >> >
> >> >    rtx_insn *last = get_last_insn ();
> >> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
> >> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >> >    gcc_assert (last == get_last_insn ());
> >> >
> >> >    return ret;
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index bee410929bd..1a804b1ab73 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,7 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "ssa.h"
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1208,64 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const override
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.
> > +       vectype is the corresponding ADVSIMD type.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     tree lhs_type = TREE_TYPE (lhs);
> > +     poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > +     tree eltype = TREE_TYPE (lhs_type);
> > +
> > +     scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> > +     machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> > +     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> > +
> > +     tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> > +     tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> > +     vec_perm_builder sel (lhs_len, source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > +                                                TYPE_MODE (access_type),
> > +                                                indices));
> > +     tree mask_type = build_vector_type (ssizetype, lhs_len);
> > +     tree mask = vec_perm_indices_to_tree (mask_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR,
> > +                                 mem_ref_lhs, mem_ref_lhs, mask);
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d4c575ce976..bb24701b0d2 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
> >  {
> >    rtx target, op0, op1;
> >    vec_perm_indices perm;
> > +  machine_mode op_mode;
> >    machine_mode vmode;
> >    unsigned int vec_flags;
> > +  unsigned int op_vec_flags;
>
> Very minor, but it would be good to keep the order consistent:
> output mode first or input mode first.  Guess it might as well
> be output mode first, to match the hook:
>
>   machine_mode vmode;
>   machine_mode op_mode;
>   unsigned int vec_flags;
>   unsigned int op_vec_flags;
>
> >    bool one_vector_p;
> >    bool testing_p;
> >  };
> > @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || !d->one_vector_p
> > +      || d->vec_flags != VEC_SVE_DATA
> > +      || d->op_vec_flags != VEC_ADVSIMD
>
> Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need:
>
>       || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)
>
> This isn't redundant with any of the other tests.
>
> (We can use DUP .D for 64-bit input vectors, but that's a separate patch.)
>
> OK with those changes (including using "else" :-)), thanks.
Hi,
The patch regressed vdup_n_3.c and vzip_{2,3,4}.c because
aarch64_expand_vec_perm_const_1
was getting passed uninitialized values for d->op_mode and
d->op_vec_flags when called from
aarch64_evpc_reencode. The attached patch fixes the issue by setting
newd.op_mode to newd.vmode and likewise for op_vec_flags.
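
I.e., mirroring the corresponding hunk in the attached patch,
aarch64_evpc_reencode now does:

  newd.vmode = new_mode;
  newd.vec_flags = VEC_ADVSIMD;
  /* Initialize the operand-side fields as well; they were previously
     left uninitialized on this path.  */
  newd.op_mode = newd.vmode;
  newd.op_vec_flags = newd.vec_flags;
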
Does that look OK ?
Bootstrap+test in progress on aarch64-linux-gnu.

PS: How do I bootstrap with SVE enabled?
Would running make BOOT_CFLAGS="-mcpu=generic+sve" be sufficient?
So far I have only tested the patch with a normal bootstrap+test.

Thanks,
Prathamesh

>
> Richard
>
> > +      || d->perm.encoding ().nelts_per_pattern () != 1
> > +      || !known_eq (d->perm.encoding ().npatterns (),
> > +                 GET_MODE_NUNITS (d->op_mode)))
> > +    return false;
> > +
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24084,30 +24112,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >         || d->vec_flags == VEC_SVE_PRED)
> >        && known_gt (nelt, 1))
> >      {
> > -      if (aarch64_evpc_rev_local (d))
> > -     return true;
> > -      else if (aarch64_evpc_rev_global (d))
> > -     return true;
> > -      else if (aarch64_evpc_ext (d))
> > -     return true;
> > -      else if (aarch64_evpc_dup (d))
> > -     return true;
> > -      else if (aarch64_evpc_zip (d))
> > -     return true;
> > -      else if (aarch64_evpc_uzp (d))
> > -     return true;
> > -      else if (aarch64_evpc_trn (d))
> > -     return true;
> > -      else if (aarch64_evpc_sel (d))
> > -     return true;
> > -      else if (aarch64_evpc_ins (d))
> > -     return true;
> > -      else if (aarch64_evpc_reencode (d))
> > +      /* If operand and result modes differ, then only check
> > +      for dup case.  */
> > +      if (d->vmode == d->op_mode)
> > +     {
> > +       if (aarch64_evpc_rev_local (d))
> > +         return true;
> > +       else if (aarch64_evpc_rev_global (d))
> > +         return true;
> > +       else if (aarch64_evpc_ext (d))
> > +         return true;
> > +       else if (aarch64_evpc_dup (d))
> > +         return true;
> > +       else if (aarch64_evpc_zip (d))
> > +         return true;
> > +       else if (aarch64_evpc_uzp (d))
> > +         return true;
> > +       else if (aarch64_evpc_trn (d))
> > +         return true;
> > +       else if (aarch64_evpc_sel (d))
> > +         return true;
> > +       else if (aarch64_evpc_ins (d))
> > +         return true;
> > +       else if (aarch64_evpc_reencode (d))
> > +         return true;
> > +
> > +       if (d->vec_flags == VEC_SVE_DATA)
> > +         return aarch64_evpc_sve_tbl (d);
> > +       else if (d->vec_flags == VEC_ADVSIMD)
> > +         return aarch64_evpc_tbl (d);
> > +     }
> > +
> > +      if (aarch64_evpc_sve_dup (d))
> >       return true;
> > -      if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > -      else if (d->vec_flags == VEC_ADVSIMD)
> > -     return aarch64_evpc_tbl (d);
> >      }
> >    return false;
> >  }
> > @@ -24119,9 +24156,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >                                 rtx target, rtx op0, rtx op1,
> >                                 const vec_perm_indices &sel)
> >  {
> > -  if (vmode != op_mode)
> > -    return false;
> > -
> >    struct expand_vec_perm_d d;
> >
> >    /* Check whether the mask can be applied to a single vector.  */
> > @@ -24145,6 +24179,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> >                    sel.nelts_per_input ());
> >    d.vmode = vmode;
> >    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
> > +  d.op_mode = op_mode;
> > +  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
> >    d.target = target;
> >    d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
> >    if (op0 == op1)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index bee410929bd..1a804b1ab73 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,64 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+	tree eltype = TREE_TYPE (lhs_type);
+
+	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (lhs_len, source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   TYPE_MODE (access_type),
+						   indices));
+	tree mask_type = build_vector_type (ssizetype, lhs_len);
+	tree mask = vec_perm_indices_to_tree (mask_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d4c575ce976..371174569f0 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23396,7 +23396,9 @@ struct expand_vec_perm_d
   rtx target, op0, op1;
   vec_perm_indices perm;
   machine_mode vmode;
+  machine_mode op_mode;
   unsigned int vec_flags;
+  unsigned int op_vec_flags;
   bool one_vector_p;
   bool testing_p;
 };
@@ -23631,6 +23633,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 
   newd.vmode = new_mode;
   newd.vec_flags = VEC_ADVSIMD;
+  newd.op_mode = newd.vmode;
+  newd.op_vec_flags = newd.vec_flags;
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
@@ -23945,6 +23949,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || !d->one_vector_p
+      || d->vec_flags != VEC_SVE_DATA
+      || d->op_vec_flags != VEC_ADVSIMD
+      || d->perm.encoding ().nelts_per_pattern () != 1
+      || !known_eq (d->perm.encoding ().npatterns (),
+		    GET_MODE_NUNITS (d->op_mode))
+      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24068,6 +24099,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
+  gcc_assert (d->op_mode != E_VOIDmode);
+
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
      from the second operand, we can swap the operands.  */
@@ -24084,30 +24117,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
        || d->vec_flags == VEC_SVE_PRED)
       && known_gt (nelt, 1))
     {
-      if (aarch64_evpc_rev_local (d))
-	return true;
-      else if (aarch64_evpc_rev_global (d))
-	return true;
-      else if (aarch64_evpc_ext (d))
-	return true;
-      else if (aarch64_evpc_dup (d))
-	return true;
-      else if (aarch64_evpc_zip (d))
-	return true;
-      else if (aarch64_evpc_uzp (d))
-	return true;
-      else if (aarch64_evpc_trn (d))
-	return true;
-      else if (aarch64_evpc_sel (d))
-	return true;
-      else if (aarch64_evpc_ins (d))
-	return true;
-      else if (aarch64_evpc_reencode (d))
-	return true;
-      if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
-      else if (d->vec_flags == VEC_ADVSIMD)
-	return aarch64_evpc_tbl (d);
+      if (d->vmode == d->op_mode)
+	{
+	  if (aarch64_evpc_rev_local (d))
+	    return true;
+	  else if (aarch64_evpc_rev_global (d))
+	    return true;
+	  else if (aarch64_evpc_ext (d))
+	    return true;
+	  else if (aarch64_evpc_dup (d))
+	    return true;
+	  else if (aarch64_evpc_zip (d))
+	    return true;
+	  else if (aarch64_evpc_uzp (d))
+	    return true;
+	  else if (aarch64_evpc_trn (d))
+	    return true;
+	  else if (aarch64_evpc_sel (d))
+	    return true;
+	  else if (aarch64_evpc_ins (d))
+	    return true;
+	  else if (aarch64_evpc_reencode (d))
+	    return true;
+
+	  if (d->vec_flags == VEC_SVE_DATA)
+	    return aarch64_evpc_sve_tbl (d);
+	  else if (d->vec_flags == VEC_ADVSIMD)
+	    return aarch64_evpc_tbl (d);
+	}
+      else
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	}
     }
   return false;
 }
@@ -24119,9 +24161,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  rtx target, rtx op0, rtx op1,
 				  const vec_perm_indices &sel)
 {
-  if (vmode != op_mode)
-    return false;
-
   struct expand_vec_perm_d d;
 
   /* Check whether the mask can be applied to a single vector.  */
@@ -24145,6 +24184,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 		     sel.nelts_per_input ());
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+  d.op_mode = op_mode;
+  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
   d.target = target;
   d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
   if (op0 == op1)
  
Richard Sandiford June 7, 2022, 11:02 a.m. UTC | #14
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Mon, 6 Jun 2022 at 16:29, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> >  {
>> >> >    /* The pattern matching functions above are written to look for a small
>> >> >       number to begin the sequence (0, 1, N/2).  If we begin with an index
>> >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >> >         || d->vec_flags == VEC_SVE_PRED)
>> >> >        && known_gt (nelt, 1))
>> >> >      {
>> >> > +      /* If operand and result modes differ, then only check
>> >> > +      for dup case.  */
>> >> > +      if (d->vmode != op_mode)
>> >> > +     return (d->vec_flags == VEC_SVE_DATA)
>> >> > +             ? aarch64_evpc_sve_dup (d, op_mode) : false;
>> >> > +
>> >>
>> >> I think it'd be more future-proof to format this as:
>> >>
>> >>     if (d->vmod == d->op_mode)
>> >>       {
>> >>         …existing code…
>> >>       }
>> >>     else
>> >>       {
>> >>         if (aarch64_evpc_sve_dup (d))
>> >>           return true;
>> >>       }
>> >>
>> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
>> >> alongside the op_mode check.  I think we'll be adding more checks here
>> >> over time.
>> > Um I was wondering if we should structure it as:
>> > if (d->vmode == d->op_mode)
>> >   {
>> >      ...existing code...
>> >   }
>> > if (aarch64_evpc_sve_dup (d))
>> >   return true;
>> >
>> > So we check for dup irrespective of  d->vmode == d->op_mode ?
>>
>> Yeah, I can see the attraction of that.  I think the else is better
>> though because the fallback TBL handling will (rightly) come at the end
>> of the existing code.  Without the else, we'd have specific tests like
>> DUP after generic ones like TBL, so the reader would have to work out
>> for themselves that DUP and TBL handle disjoint cases.
>>
>> >> >        if (aarch64_evpc_rev_local (d))
>> >> >       return true;
>> >> >        else if (aarch64_evpc_rev_global (d))
>> >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >> >        else if (aarch64_evpc_reencode (d))
>> >> >       return true;
>> >> >        if (d->vec_flags == VEC_SVE_DATA)
>> >> > -     return aarch64_evpc_sve_tbl (d);
>> >> > +     {
>> >> > +       if (aarch64_evpc_sve_tbl (d))
>> >> > +         return true;
>> >> > +       else if (aarch64_evpc_sve_dup (d, op_mode))
>> >> > +         return true;
>> >> > +     }
>> >> >        else if (d->vec_flags == VEC_ADVSIMD)
>> >> >       return aarch64_evpc_tbl (d);
>> >> >      }
>> >>
>> >> Is this part still needed, given the above?
>> >>
>> >> Thanks,
>> >> Richard
>> >>
>> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >> >                                 rtx target, rtx op0, rtx op1,
>> >> >                                 const vec_perm_indices &sel)
>> >> >  {
>> >> > -  if (vmode != op_mode)
>> >> > -    return false;
>> >> > -
>> >> >    struct expand_vec_perm_d d;
>> >> >
>> >> >    /* Check whether the mask can be applied to a single vector.  */
>> >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >> >    d.testing_p = !target;
>> >> >
>> >> >    if (!d.testing_p)
>> >> > -    return aarch64_expand_vec_perm_const_1 (&d);
>> >> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >> >
>> >> >    rtx_insn *last = get_last_insn ();
>> >> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
>> >> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >> >    gcc_assert (last == get_last_insn ());
>> >> >
>> >> >    return ret;
>> >
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index bee410929bd..1a804b1ab73 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -44,6 +44,7 @@
>> >  #include "aarch64-sve-builtins-shapes.h"
>> >  #include "aarch64-sve-builtins-base.h"
>> >  #include "aarch64-sve-builtins-functions.h"
>> > +#include "ssa.h"
>> >
>> >  using namespace aarch64_sve;
>> >
>> > @@ -1207,6 +1208,64 @@ public:
>> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >      return e.use_contiguous_load_insn (icode);
>> >    }
>> > +
>> > +  gimple *
>> > +  fold (gimple_folder &f) const override
>> > +  {
>> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> > +
>> > +    /* Transform:
>> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> > +       into:
>> > +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
>> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> > +       on little endian target.
>> > +       vectype is the corresponding ADVSIMD type.  */
>> > +
>> > +    if (!BYTES_BIG_ENDIAN
>> > +     && integer_all_onesp (arg0))
>> > +      {
>> > +     tree lhs = gimple_call_lhs (f.call);
>> > +     tree lhs_type = TREE_TYPE (lhs);
>> > +     poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
>> > +     tree eltype = TREE_TYPE (lhs_type);
>> > +
>> > +     scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
>> > +     machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
>> > +     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>> > +
>> > +     tree elt_ptr_type
>> > +       = build_pointer_type_for_mode (eltype, VOIDmode, true);
>> > +     tree zero = build_zero_cst (elt_ptr_type);
>> > +
>> > +     /* Use element type alignment.  */
>> > +     tree access_type
>> > +       = build_aligned_type (vectype, TYPE_ALIGN (eltype));
>> > +
>> > +     tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
>> > +     tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
>> > +     gimple *mem_ref_stmt
>> > +       = gimple_build_assign (mem_ref_lhs, mem_ref_op);
>> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> > +
>> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
>> > +     vec_perm_builder sel (lhs_len, source_nelts, 1);
>> > +     for (int i = 0; i < source_nelts; i++)
>> > +       sel.quick_push (i);
>> > +
>> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
>> > +                                                TYPE_MODE (access_type),
>> > +                                                indices));
>> > +     tree mask_type = build_vector_type (ssizetype, lhs_len);
>> > +     tree mask = vec_perm_indices_to_tree (mask_type, indices);
>> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR,
>> > +                                 mem_ref_lhs, mem_ref_lhs, mask);
>> > +      }
>> > +
>> > +    return NULL;
>> > +  }
>> >  };
>> >
>> >  class svld1ro_impl : public load_replicate
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index d4c575ce976..bb24701b0d2 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
>> >  {
>> >    rtx target, op0, op1;
>> >    vec_perm_indices perm;
>> > +  machine_mode op_mode;
>> >    machine_mode vmode;
>> >    unsigned int vec_flags;
>> > +  unsigned int op_vec_flags;
>>
>> Very minor, but it would be good to keep the order consistent:
>> output mode first or input mode first.  Guess it might as well
>> be output mode first, to match the hook:
>>
>>   machine_mode vmode;
>>   machine_mode op_mode;
>>   unsigned int vec_flags;
>>   unsigned int op_vec_flags;
>>
>> >    bool one_vector_p;
>> >    bool testing_p;
>> >  };
>> > @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >    return true;
>> >  }
>> >
>> > +/* Try to implement D using SVE dup instruction.  */
>> > +
>> > +static bool
>> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> > +{
>> > +  if (BYTES_BIG_ENDIAN
>> > +      || !d->one_vector_p
>> > +      || d->vec_flags != VEC_SVE_DATA
>> > +      || d->op_vec_flags != VEC_ADVSIMD
>>
>> Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need:
>>
>>       || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)
>>
>> This isn't redundant with any of the other tests.
>>
>> (We can use DUP .D for 64-bit input vectors, but that's a separate patch.)
>>
>> OK with those changes (including using "else" :-)), thanks.
> Hi,
> The patch regressed vdup_n_3.c and vzip_{2,3,4}.c because
> aarch64_expand_vec_perm_const_1
> was getting passed uninitialized values for d->op_mode and
> d->op_vec_flags when called from
> aarch64_evpc_reencode. The attached patch fixes the issue by setting
> newd.op_mode to newd.vmode and likewise for op_vec_flags.
> Does that look OK ?
> Bootstrap+test in progress on aarch64-linux-gnu.

OK, thanks.

> PS: How do I bootstrap with SVE enabled?
> Would running make BOOT_CFLAGS="-mcpu=generic+sve" be sufficient?
> So far I have only tested the patch with a normal bootstrap+test.

That should work, but it's probably easier to configure with
--with-cpu=generic+sve instead (or just pick an actual SVE CPU
instead of generic+sve).  The testsuite will then be run with
SVE enabled too.
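
Something like this should do it (the directory names and the extra
configure options below are just an example of a typical setup; only
--with-cpu=generic+sve is the relevant part):

  $ mkdir obj && cd obj
  $ ../gcc/configure --with-cpu=generic+sve --enable-languages=c,c++ \
      --disable-multilib
  $ make -j$(nproc) bootstrap
  $ make -j$(nproc) -k check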

Richard
  

Patch

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 02e42a71e5e..e21bbec360c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1207,6 +1207,56 @@  public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, &v[0])
+       into:
+       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
+       on little endian target.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0)
+	&& TREE_CODE (arg1) == ADDR_EXPR)
+      {
+	tree t = TREE_OPERAND (arg1, 0);
+	if (TREE_CODE (t) == ARRAY_REF)
+	  {
+	    tree index = TREE_OPERAND (t, 1);
+	    t = TREE_OPERAND (t, 0);
+	    if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
+	      {
+		t = TREE_OPERAND (t, 0);
+		tree vectype = TREE_TYPE (t);
+		if (VECTOR_TYPE_P (vectype)
+		    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+		    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
+		  {
+		    tree lhs = gimple_call_lhs (f.call);
+		    tree lhs_type = TREE_TYPE (lhs);
+		    int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+		    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+		    for (int i = 0; i < source_nelts; i++)
+		      sel.quick_push (i);
+
+		    vec_perm_indices indices (sel, 1, source_nelts);
+		    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
+		      return NULL;
+
+		    tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+		    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
+		  }
+	      }
+	  }
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f07330cff4f..af27f550be3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23002,8 +23002,32 @@  aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
 
   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
+
   if (d->one_vector_p)
-    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+    {
+      bool use_dupq = false;
+      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
+      if (GET_CODE (sel) == CONST_VECTOR
+	  && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
+	  && CONST_VECTOR_DUPLICATE_P (sel))
+	  {
+	    unsigned nelts = const_vector_encoded_nelts (sel);
+	    unsigned i;
+	    for (i = 0; i < nelts; i++)
+	      {
+		rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
+		if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
+		  break;
+	      }
+	    if (i == nelts)
+	      use_dupq = true;
+	  }
+
+      if (use_dupq)
+	aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+      else
+	emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+    }
   else
     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
   return true;