diff mbox series

[1/2] PR96463 - aarch64 specific changes

Message ID CAAgBjM=meq6r-Yi3BJ6WWQ_eQHqV-u7nyN8NFmnxGefmeWp7hg@mail.gmail.com
State New
Headers show
Series [1/2] PR96463 - aarch64 specific changes | expand

Commit Message

Prathamesh Kulkarni Dec. 17, 2021, 10:04 a.m. UTC
Hi,
The patch folds:
lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
into:
lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
and expands above vec_perm_expr using aarch64_expand_sve_dupq.

With patch, for following test:
#include <arm_sve.h>
#include <arm_neon.h>

svint32_t
foo (int32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

it generates following code:
foo:
.LFB4350:
        dup     z0.q, z0.q[0]
        ret

and passes bootstrap+test on aarch64-linux-gnu.
But I am not sure if the changes to aarch64_evpc_sve_tbl
are correct.

Thanks,
Prathamesh

Comments

Richard Sandiford Dec. 17, 2021, 11:33 a.m. UTC | #1
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> Hi,
> The patch folds:
> lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> into:
> lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> and expands above vec_perm_expr using aarch64_expand_sve_dupq.
>
> With patch, for following test:
> #include <arm_sve.h>
> #include <arm_neon.h>
>
> svint32_t
> foo (int32x4_t x)
> {
>   return svld1rq (svptrue_b8 (), &x[0]);
> }
>
> it generates following code:
> foo:
> .LFB4350:
>         dup     z0.q, z0.q[0]
>         ret
>
> and passes bootstrap+test on aarch64-linux-gnu.
> But I am not sure if the changes to aarch64_evpc_sve_tbl
> are correct.

Just in case: I was only using int32x4_t in the PR as an example.
The same thing should work for all element types.

>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 02e42a71e5e..e21bbec360c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -1207,6 +1207,56 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> +       into:
> +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> +       on little endian target.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0)
> +	&& TREE_CODE (arg1) == ADDR_EXPR)
> +      {
> +	tree t = TREE_OPERAND (arg1, 0);
> +	if (TREE_CODE (t) == ARRAY_REF)
> +	  {
> +	    tree index = TREE_OPERAND (t, 1);
> +	    t = TREE_OPERAND (t, 0);
> +	    if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> +	      {
> +		t = TREE_OPERAND (t, 0);
> +		tree vectype = TREE_TYPE (t);
> +		if (VECTOR_TYPE_P (vectype)
> +		    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> +		    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> +		  {

Since this is quite a specific pattern match, and since we now lower
arm_neon.h vld1* to normal gimple accesses, I think we should try the
“more generally” approach mentioned in the PR and see what the fallout
is.  That is, keep:

    if (!BYTES_BIG_ENDIAN
	&& integer_all_onesp (arg0)

If those conditions pass, create an Advanced SIMD access at address arg1,
using similar code to the handling of:

     BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
     BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
     BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)

in aarch64_general_gimple_fold_builtin.  (Would be good to move the
common code to aarch64.c so that both files can use it.)

> +		    tree lhs = gimple_call_lhs (f.call);
> +		    tree lhs_type = TREE_TYPE (lhs);
> +		    int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> +		    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +		    for (int i = 0; i < source_nelts; i++)
> +		      sel.quick_push (i);
> +
> +		    vec_perm_indices indices (sel, 1, source_nelts);
> +		    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> +		      return NULL;

I don't think we need to check this: it should always be true.
Probably worth keeping as a gcc_checking_assert though.

> +
> +		    tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +		    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> +		  }
> +	      }
> +	  }
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index f07330cff4f..af27f550be3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>  
>    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
>    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> +
>    if (d->one_vector_p)
> -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> +    {
> +      bool use_dupq = false;
> +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> +      if (GET_CODE (sel) == CONST_VECTOR
> +	  && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> +	  && CONST_VECTOR_DUPLICATE_P (sel))
> +	  {
> +	    unsigned nelts = const_vector_encoded_nelts (sel);
> +	    unsigned i;
> +	    for (i = 0; i < nelts; i++)
> +	      {
> +		rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> +		if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> +		  break;
> +	      }
> +	    if (i == nelts)
> +	      use_dupq = true;
> +	  }
> +
> +      if (use_dupq)
> +	aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +      else
> +	emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> +    }

This shouldn't be a TBL but a new operation, handled by its own
aarch64_evpc_sve_* routine.  The check for the mask should then
be done on d->perm, to detect whether the permutation is one
that the new routine supports.

I think the requirements are:

- !BYTES_BIG_ENDIAN
- the source must be an Advanced SIMD vector
- the destination must be an SVE vector
- the permutation must be a duplicate (tested in the code above)
- the number of “patterns” in the permutation must equal the number of
  source elements
- element X of the permutation must equal X (tested in the code above)

The existing aarch64_evpc_* routines expect the source and target modes
to be the same, so we should only call them when that's true.

Thanks,
Richard
Prathamesh Kulkarni Dec. 27, 2021, 10:24 a.m. UTC | #2
On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > Hi,
> > The patch folds:
> > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > into:
> > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> >
> > With patch, for following test:
> > #include <arm_sve.h>
> > #include <arm_neon.h>
> >
> > svint32_t
> > foo (int32x4_t x)
> > {
> >   return svld1rq (svptrue_b8 (), &x[0]);
> > }
> >
> > it generates following code:
> > foo:
> > .LFB4350:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > and passes bootstrap+test on aarch64-linux-gnu.
> > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > are correct.
>
> Just in case: I was only using int32x4_t in the PR as an example.
> The same thing should work for all element types.
>
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index 02e42a71e5e..e21bbec360c 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -1207,6 +1207,56 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > +       into:
> > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > +       on little endian target.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0)
> > +     && TREE_CODE (arg1) == ADDR_EXPR)
> > +      {
> > +     tree t = TREE_OPERAND (arg1, 0);
> > +     if (TREE_CODE (t) == ARRAY_REF)
> > +       {
> > +         tree index = TREE_OPERAND (t, 1);
> > +         t = TREE_OPERAND (t, 0);
> > +         if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > +           {
> > +             t = TREE_OPERAND (t, 0);
> > +             tree vectype = TREE_TYPE (t);
> > +             if (VECTOR_TYPE_P (vectype)
> > +                 && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > +                 && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > +               {
>
> Since this is quite a specific pattern match, and since we now lower
> arm_neon.h vld1* to normal gimple accesses, I think we should try the
> “more generally” approach mentioned in the PR and see what the fallout
> is.  That is, keep:
>
>     if (!BYTES_BIG_ENDIAN
>         && integer_all_onesp (arg0)
>
> If those conditions pass, create an Advanced SIMD access at address arg1,
> using similar code to the handling of:
>
>      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
>      BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
>      BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
>
> in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> common code to aarch64.c so that both files can use it.)
>
> > +                 tree lhs = gimple_call_lhs (f.call);
> > +                 tree lhs_type = TREE_TYPE (lhs);
> > +                 int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > +                 vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +                 for (int i = 0; i < source_nelts; i++)
> > +                   sel.quick_push (i);
> > +
> > +                 vec_perm_indices indices (sel, 1, source_nelts);
> > +                 if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> > +                   return NULL;
>
> I don't think we need to check this: it should always be true.
> Probably worth keeping as a gcc_checking_assert though.
>
> > +
> > +                 tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +                 return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> > +               }
> > +           }
> > +       }
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index f07330cff4f..af27f550be3 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >
> >    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> >    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > +
> >    if (d->one_vector_p)
> > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > +    {
> > +      bool use_dupq = false;
> > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> > +      if (GET_CODE (sel) == CONST_VECTOR
> > +       && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > +       && CONST_VECTOR_DUPLICATE_P (sel))
> > +       {
> > +         unsigned nelts = const_vector_encoded_nelts (sel);
> > +         unsigned i;
> > +         for (i = 0; i < nelts; i++)
> > +           {
> > +             rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > +             if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > +               break;
> > +           }
> > +         if (i == nelts)
> > +           use_dupq = true;
> > +       }
> > +
> > +      if (use_dupq)
> > +     aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +      else
> > +     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > +    }
>
> This shouldn't be a TBL but a new operation, handled by its own
> aarch64_evpc_sve_* routine.  The check for the mask should then
> be done on d->perm, to detect whether the permutation is one
> that the new routine supports.
>
> I think the requirements are:
>
> - !BYTES_BIG_ENDIAN
> - the source must be an Advanced SIMD vector
> - the destination must be an SVE vector
> - the permutation must be a duplicate (tested in the code above)
> - the number of “patterns” in the permutation must equal the number of
>   source elements
> - element X of the permutation must equal X (tested in the code above)
>
> The existing aarch64_evpc_* routines expect the source and target modes
> to be the same, so we should only call them when that's true.
Hi Richard,
Thanks for the suggestions, and sorry for the late reply.
Does the following patch look OK (sans the refactoring of building mem_ref)?
Passes bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 0d09fe9dd6d..656d39a741c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -47,6 +47,7 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "gimple-fold.h"
+#include "aarch64-builtins.h"
 
 #define v8qi_UP  E_V8QImode
 #define v8di_UP  E_V8DImode
@@ -128,46 +129,6 @@
 
 #define SIMD_MAX_BUILTIN_ARGS 5
 
-enum aarch64_type_qualifiers
-{
-  /* T foo.  */
-  qualifier_none = 0x0,
-  /* unsigned T foo.  */
-  qualifier_unsigned = 0x1, /* 1 << 0  */
-  /* const T foo.  */
-  qualifier_const = 0x2, /* 1 << 1  */
-  /* T *foo.  */
-  qualifier_pointer = 0x4, /* 1 << 2  */
-  /* Used when expanding arguments if an operand could
-     be an immediate.  */
-  qualifier_immediate = 0x8, /* 1 << 3  */
-  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
-  /* void foo (...).  */
-  qualifier_void = 0x20, /* 1 << 5  */
-  /* Some patterns may have internal operands, this qualifier is an
-     instruction to the initialisation code to skip this operand.  */
-  qualifier_internal = 0x40, /* 1 << 6  */
-  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
-     rather than using the type of the operand.  */
-  qualifier_map_mode = 0x80, /* 1 << 7  */
-  /* qualifier_pointer | qualifier_map_mode  */
-  qualifier_pointer_map_mode = 0x84,
-  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
-  qualifier_const_pointer_map_mode = 0x86,
-  /* Polynomial types.  */
-  qualifier_poly = 0x100,
-  /* Lane indices - must be in range, and flipped for bigendian.  */
-  qualifier_lane_index = 0x200,
-  /* Lane indices for single lane structure loads and stores.  */
-  qualifier_struct_load_store_lane_index = 0x400,
-  /* Lane indices selected in pairs. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_pair_index = 0x800,
-  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_quadtup_index = 0x1000,
-};
-
 /* Flags that describe what a function might do.  */
 const unsigned int FLAG_NONE = 0U;
 const unsigned int FLAG_READ_FPCR = 1U << 0;
@@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = {
   NULL
 };
 
-#define ENTRY(E, M, Q, G) E,
-enum aarch64_simd_type
-{
-#include "aarch64-simd-builtin-types.def"
-  ARM_NEON_H_TYPES_LAST
-};
-#undef ENTRY
-
-struct GTY(()) aarch64_simd_type_info
-{
-  enum aarch64_simd_type type;
-
-  /* Internal type name.  */
-  const char *name;
-
-  /* Internal type name(mangled).  The mangled names conform to the
-     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
-     Appendix A).  To qualify for emission with the mangled names defined in
-     that document, a vector type must not only be of the correct mode but also
-     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
-     types are registered by aarch64_init_simd_builtin_types ().  In other
-     words, vector types defined in other ways e.g. via vector_size attribute
-     will get default mangled names.  */
-  const char *mangle;
-
-  /* Internal type.  */
-  tree itype;
-
-  /* Element type.  */
-  tree eltype;
-
-  /* Machine mode the internal type maps to.  */
-  enum machine_mode mode;
-
-  /* Qualifiers.  */
-  enum aarch64_type_qualifiers q;
-};
-
 #define ENTRY(E, M, Q, G)  \
   {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q},
 static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = {
@@ -2796,6 +2719,14 @@ get_mem_type_for_load_store (unsigned int fcode)
   }
 }
 
+/* Return aarch64_simd_type_info corresponding to TYPE.  */
+
+aarch64_simd_type_info
+aarch64_get_simd_info_for_type (enum aarch64_simd_type type)
+{
+  return aarch64_simd_types[type];
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h
new file mode 100644
index 00000000000..b395402379c
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtins.h
@@ -0,0 +1,85 @@
+#ifndef AARCH64_BUILTINS_H
+#define AARCH64_BUILTINS_H
+
+#define ENTRY(E, M, Q, G) E,
+enum aarch64_simd_type
+{
+#include "aarch64-simd-builtin-types.def"
+  ARM_NEON_H_TYPES_LAST
+};
+#undef ENTRY
+
+enum aarch64_type_qualifiers
+{
+  /* T foo.  */
+  qualifier_none = 0x0,
+  /* unsigned T foo.  */
+  qualifier_unsigned = 0x1, /* 1 << 0  */
+  /* const T foo.  */
+  qualifier_const = 0x2, /* 1 << 1  */
+  /* T *foo.  */
+  qualifier_pointer = 0x4, /* 1 << 2  */
+  /* Used when expanding arguments if an operand could
+     be an immediate.  */
+  qualifier_immediate = 0x8, /* 1 << 3  */
+  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
+  /* void foo (...).  */
+  qualifier_void = 0x20, /* 1 << 5  */
+  /* Some patterns may have internal operands, this qualifier is an
+     instruction to the initialisation code to skip this operand.  */
+  qualifier_internal = 0x40, /* 1 << 6  */
+  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
+     rather than using the type of the operand.  */
+  qualifier_map_mode = 0x80, /* 1 << 7  */
+  /* qualifier_pointer | qualifier_map_mode  */
+  qualifier_pointer_map_mode = 0x84,
+  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
+  qualifier_const_pointer_map_mode = 0x86,
+  /* Polynomial types.  */
+  qualifier_poly = 0x100,
+  /* Lane indices - must be in range, and flipped for bigendian.  */
+  qualifier_lane_index = 0x200,
+  /* Lane indices for single lane structure loads and stores.  */
+  qualifier_struct_load_store_lane_index = 0x400,
+  /* Lane indices selected in pairs. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_pair_index = 0x800,
+  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_quadtup_index = 0x1000,
+};
+
+struct GTY(()) aarch64_simd_type_info
+{
+  enum aarch64_simd_type type;
+
+  /* Internal type name.  */
+  const char *name;
+
+  /* Internal type name(mangled).  The mangled names conform to the
+     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
+     Appendix A).  To qualify for emission with the mangled names defined in
+     that document, a vector type must not only be of the correct mode but also
+     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
+     types are registered by aarch64_init_simd_builtin_types ().  In other
+     words, vector types defined in other ways e.g. via vector_size attribute
+     will get default mangled names.  */
+  const char *mangle;
+
+  /* Internal type.  */
+  tree itype;
+
+  /* Element type.  */
+  tree eltype;
+
+  /* Machine mode the internal type maps to.  */
+  enum machine_mode mode;
+
+  /* Qualifiers.  */
+  enum aarch64_type_qualifiers q;
+};
+
+aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type);
+
+#endif /* AARCH64_BUILTINS_H */
+
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 02e42a71e5e..51e6c1a9cc4 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,14 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "aarch64-builtins.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "tree-ssa-operands.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "value-range.h"
+#include "tree-ssanames.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1215,56 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1] 
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))	
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
+
+	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
+	tree vectype = TREE_TYPE (mem_ref_lhs);
+	tree lhs_type = TREE_TYPE (lhs);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f07330cff4f..dc6e5ca1e1d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23009,6 +23009,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
+      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -23169,7 +23198,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+        {
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
new file mode 100644
index 00000000000..35100a9e01c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+svint32_t f1 (int32x4_t x)
+{
+  return svld1rq (svptrue_b8 (), &x[0]);
+}
+
+svint32_t f2 (int *x)
+{
+  return svld1rq (svptrue_b8 (), x);
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
Prathamesh Kulkarni May 3, 2022, 10:40 a.m. UTC | #3
On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > > Hi,
> > > The patch folds:
> > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > > into:
> > > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> > >
> > > With patch, for following test:
> > > #include <arm_sve.h>
> > > #include <arm_neon.h>
> > >
> > > svint32_t
> > > foo (int32x4_t x)
> > > {
> > >   return svld1rq (svptrue_b8 (), &x[0]);
> > > }
> > >
> > > it generates following code:
> > > foo:
> > > .LFB4350:
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > and passes bootstrap+test on aarch64-linux-gnu.
> > > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > > are correct.
> >
> > Just in case: I was only using int32x4_t in the PR as an example.
> > The same thing should work for all element types.
> >
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > index 02e42a71e5e..e21bbec360c 100644
> > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > @@ -1207,6 +1207,56 @@ public:
> > >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> > >      return e.use_contiguous_load_insn (icode);
> > >    }
> > > +
> > > +  gimple *
> > > +  fold (gimple_folder &f) const OVERRIDE
> > > +  {
> > > +    tree arg0 = gimple_call_arg (f.call, 0);
> > > +    tree arg1 = gimple_call_arg (f.call, 1);
> > > +
> > > +    /* Transform:
> > > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > > +       into:
> > > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > > +       on little endian target.  */
> > > +
> > > +    if (!BYTES_BIG_ENDIAN
> > > +     && integer_all_onesp (arg0)
> > > +     && TREE_CODE (arg1) == ADDR_EXPR)
> > > +      {
> > > +     tree t = TREE_OPERAND (arg1, 0);
> > > +     if (TREE_CODE (t) == ARRAY_REF)
> > > +       {
> > > +         tree index = TREE_OPERAND (t, 1);
> > > +         t = TREE_OPERAND (t, 0);
> > > +         if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > > +           {
> > > +             t = TREE_OPERAND (t, 0);
> > > +             tree vectype = TREE_TYPE (t);
> > > +             if (VECTOR_TYPE_P (vectype)
> > > +                 && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > > +                 && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > > +               {
> >
> > Since this is quite a specific pattern match, and since we now lower
> > arm_neon.h vld1* to normal gimple accesses, I think we should try the
> > “more generally” approach mentioned in the PR and see what the fallout
> > is.  That is, keep:
> >
> >     if (!BYTES_BIG_ENDIAN
> >         && integer_all_onesp (arg0)
> >
> > If those conditions pass, create an Advanced SIMD access at address arg1,
> > using similar code to the handling of:
> >
> >      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
> >      BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> >      BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
> >
> > in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> > common code to aarch64.c so that both files can use it.)
> >
> > > +                 tree lhs = gimple_call_lhs (f.call);
> > > +                 tree lhs_type = TREE_TYPE (lhs);
> > > +                 int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > > +                 vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > > +                 for (int i = 0; i < source_nelts; i++)
> > > +                   sel.quick_push (i);
> > > +
> > > +                 vec_perm_indices indices (sel, 1, source_nelts);
> > > +                 if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> > > +                   return NULL;
> >
> > I don't think we need to check this: it should always be true.
> > Probably worth keeping as a gcc_checking_assert though.
> >
> > > +
> > > +                 tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > > +                 return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> > > +               }
> > > +           }
> > > +       }
> > > +      }
> > > +
> > > +    return NULL;
> > > +  }
> > >  };
> > >
> > >  class svld1ro_impl : public load_replicate
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index f07330cff4f..af27f550be3 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> > >
> > >    machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> > >    rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > > +
> > >    if (d->one_vector_p)
> > > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > > +    {
> > > +      bool use_dupq = false;
> > > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ... nelts}  */
> > > +      if (GET_CODE (sel) == CONST_VECTOR
> > > +       && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > > +       && CONST_VECTOR_DUPLICATE_P (sel))
> > > +       {
> > > +         unsigned nelts = const_vector_encoded_nelts (sel);
> > > +         unsigned i;
> > > +         for (i = 0; i < nelts; i++)
> > > +           {
> > > +             rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > > +             if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > > +               break;
> > > +           }
> > > +         if (i == nelts)
> > > +           use_dupq = true;
> > > +       }
> > > +
> > > +      if (use_dupq)
> > > +     aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > > +      else
> > > +     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> > > +    }
> >
> > This shouldn't be a TBL but a new operation, handled by its own
> > aarch64_evpc_sve_* routine.  The check for the mask should then
> > be done on d->perm, to detect whether the permutation is one
> > that the new routine supports.
> >
> > I think the requirements are:
> >
> > - !BYTES_BIG_ENDIAN
> > - the source must be an Advanced SIMD vector
> > - the destination must be an SVE vector
> > - the permutation must be a duplicate (tested in the code above)
> > - the number of “patterns” in the permutation must equal the number of
> >   source elements
> > - element X of the permutation must equal X (tested in the code above)
> >
> > The existing aarch64_evpc_* routines expect the source and target modes
> > to be the same, so we should only call them when that's true.
> Hi Richard,
> Thanks for the suggestions, and sorry for late reply.
> Does the following patch look OK (sans the refactoring of building mem_ref) ?
> Passes bootstrap+test on aarch64-linux-gnu.
Hi Richard,
Since stage-1 has reopened, does the attached patch look OK to commit?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index c21476d7ae9..cfcd9117ce3 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -47,6 +47,7 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "gimple-fold.h"
+#include "aarch64-builtins.h"
 
 #define v8qi_UP  E_V8QImode
 #define v8di_UP  E_V8DImode
@@ -128,46 +129,6 @@
 
 #define SIMD_MAX_BUILTIN_ARGS 5
 
-enum aarch64_type_qualifiers
-{
-  /* T foo.  */
-  qualifier_none = 0x0,
-  /* unsigned T foo.  */
-  qualifier_unsigned = 0x1, /* 1 << 0  */
-  /* const T foo.  */
-  qualifier_const = 0x2, /* 1 << 1  */
-  /* T *foo.  */
-  qualifier_pointer = 0x4, /* 1 << 2  */
-  /* Used when expanding arguments if an operand could
-     be an immediate.  */
-  qualifier_immediate = 0x8, /* 1 << 3  */
-  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
-  /* void foo (...).  */
-  qualifier_void = 0x20, /* 1 << 5  */
-  /* Some patterns may have internal operands, this qualifier is an
-     instruction to the initialisation code to skip this operand.  */
-  qualifier_internal = 0x40, /* 1 << 6  */
-  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
-     rather than using the type of the operand.  */
-  qualifier_map_mode = 0x80, /* 1 << 7  */
-  /* qualifier_pointer | qualifier_map_mode  */
-  qualifier_pointer_map_mode = 0x84,
-  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
-  qualifier_const_pointer_map_mode = 0x86,
-  /* Polynomial types.  */
-  qualifier_poly = 0x100,
-  /* Lane indices - must be in range, and flipped for bigendian.  */
-  qualifier_lane_index = 0x200,
-  /* Lane indices for single lane structure loads and stores.  */
-  qualifier_struct_load_store_lane_index = 0x400,
-  /* Lane indices selected in pairs. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_pair_index = 0x800,
-  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
-     bigendian.  */
-  qualifier_lane_quadtup_index = 0x1000,
-};
-
 /* Flags that describe what a function might do.  */
 const unsigned int FLAG_NONE = 0U;
 const unsigned int FLAG_READ_FPCR = 1U << 0;
@@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = {
   NULL
 };
 
-#define ENTRY(E, M, Q, G) E,
-enum aarch64_simd_type
-{
-#include "aarch64-simd-builtin-types.def"
-  ARM_NEON_H_TYPES_LAST
-};
-#undef ENTRY
-
-struct GTY(()) aarch64_simd_type_info
-{
-  enum aarch64_simd_type type;
-
-  /* Internal type name.  */
-  const char *name;
-
-  /* Internal type name(mangled).  The mangled names conform to the
-     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
-     Appendix A).  To qualify for emission with the mangled names defined in
-     that document, a vector type must not only be of the correct mode but also
-     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
-     types are registered by aarch64_init_simd_builtin_types ().  In other
-     words, vector types defined in other ways e.g. via vector_size attribute
-     will get default mangled names.  */
-  const char *mangle;
-
-  /* Internal type.  */
-  tree itype;
-
-  /* Element type.  */
-  tree eltype;
-
-  /* Machine mode the internal type maps to.  */
-  enum machine_mode mode;
-
-  /* Qualifiers.  */
-  enum aarch64_type_qualifiers q;
-};
-
 #define ENTRY(E, M, Q, G)  \
   {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q},
 static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = {
@@ -2826,6 +2749,14 @@ get_mem_type_for_load_store (unsigned int fcode)
   }
 }
 
+/* Return aarch64_simd_type_info corresponding to TYPE.  */
+
+aarch64_simd_type_info
+aarch64_get_simd_info_for_type (enum aarch64_simd_type type)
+{
+  return aarch64_simd_types[type];
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h
new file mode 100644
index 00000000000..4d155566dc5
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtins.h
@@ -0,0 +1,101 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_BUILTINS_H
+#define AARCH64_BUILTINS_H
+
+#define ENTRY(E, M, Q, G) E,
+enum aarch64_simd_type
+{
+#include "aarch64-simd-builtin-types.def"
+  ARM_NEON_H_TYPES_LAST
+};
+#undef ENTRY
+
+enum aarch64_type_qualifiers
+{
+  /* T foo.  */
+  qualifier_none = 0x0,
+  /* unsigned T foo.  */
+  qualifier_unsigned = 0x1, /* 1 << 0  */
+  /* const T foo.  */
+  qualifier_const = 0x2, /* 1 << 1  */
+  /* T *foo.  */
+  qualifier_pointer = 0x4, /* 1 << 2  */
+  /* Used when expanding arguments if an operand could
+     be an immediate.  */
+  qualifier_immediate = 0x8, /* 1 << 3  */
+  qualifier_maybe_immediate = 0x10, /* 1 << 4  */
+  /* void foo (...).  */
+  qualifier_void = 0x20, /* 1 << 5  */
+  /* Some patterns may have internal operands, this qualifier is an
+     instruction to the initialisation code to skip this operand.  */
+  qualifier_internal = 0x40, /* 1 << 6  */
+  /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
+     rather than using the type of the operand.  */
+  qualifier_map_mode = 0x80, /* 1 << 7  */
+  /* qualifier_pointer | qualifier_map_mode  */
+  qualifier_pointer_map_mode = 0x84,
+  /* qualifier_const | qualifier_pointer | qualifier_map_mode  */
+  qualifier_const_pointer_map_mode = 0x86,
+  /* Polynomial types.  */
+  qualifier_poly = 0x100,
+  /* Lane indices - must be in range, and flipped for bigendian.  */
+  qualifier_lane_index = 0x200,
+  /* Lane indices for single lane structure loads and stores.  */
+  qualifier_struct_load_store_lane_index = 0x400,
+  /* Lane indices selected in pairs. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_pair_index = 0x800,
+  /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+     bigendian.  */
+  qualifier_lane_quadtup_index = 0x1000,
+};
+
+struct GTY(()) aarch64_simd_type_info
+{
+  enum aarch64_simd_type type;
+
+  /* Internal type name.  */
+  const char *name;
+
+  /* Internal type name(mangled).  The mangled names conform to the
+     AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture",
+     Appendix A).  To qualify for emission with the mangled names defined in
+     that document, a vector type must not only be of the correct mode but also
+     be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these
+     types are registered by aarch64_init_simd_builtin_types ().  In other
+     words, vector types defined in other ways e.g. via vector_size attribute
+     will get default mangled names.  */
+  const char *mangle;
+
+  /* Internal type.  */
+  tree itype;
+
+  /* Element type.  */
+  tree eltype;
+
+  /* Machine mode the internal type maps to.  */
+  enum machine_mode mode;
+
+  /* Qualifiers.  */
+  enum aarch64_type_qualifiers q;
+};
+
+aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type);
+
+#endif /* AARCH64_BUILTINS_H */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c0548724..1ef4ea2087b 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,14 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "aarch64-builtins.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "tree-ssa-operands.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "value-range.h"
+#include "tree-ssanames.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1215,56 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
+
+	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
+	tree vectype = TREE_TYPE (mem_ref_lhs);
+	tree lhs_type = TREE_TYPE (lhs);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce..47810fec804 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
+      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
new file mode 100644
index 00000000000..35100a9e01c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+svint32_t f1 (int32x4_t x)
+{
+  return svld1rq (svptrue_b8 (), &x[0]);
+}
+
+svint32_t f2 (int *x)
+{
+  return svld1rq (svptrue_b8 (), x);
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
Richard Sandiford May 6, 2022, 10:30 a.m. UTC | #4
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c0548724..1ef4ea2087b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,14 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "aarch64-builtins.h"
> +#include "gimple-ssa.h"
> +#include "tree-phinodes.h"
> +#include "tree-ssa-operands.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "value-range.h"
> +#include "tree-ssanames.h"

Minor, but: I think the preferred approach is to include "ssa.h"
rather than include some of these headers directly.

>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1215,56 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);

Does this work for other element sizes?  I would have expected it
to be the (128-bit) Advanced SIMD vector associated with the same
element type as the SVE vector.

The testcase should cover more than just int32x4_t -> svint32_t,
just to be sure.

> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> +
> +	tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));

Long line.  Might be easier to format by assigning the fold_build2 result
to a temporary variable.

> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> +	tree vectype = TREE_TYPE (mem_ref_lhs);
> +	tree lhs_type = TREE_TYPE (lhs);

Is this necessary?  The code above supplied the types and I wouldn't
have expected them to change during the build process.

> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> +	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> +	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);

Nit: long line.

> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce..47810fec804 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || d->perm.length ().is_constant ()
> +      || !d->one_vector_p
> +      || d->target == NULL
> +      || d->op0 == NULL

These last two lines mean that we always return false for d->testing.
The idea instead is that the return value should be the same for both
d->testing and !d->testing.  The difference is that for !d->testing we
also emit code to do the permute.

> +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()

Sorry, I've forgotten the context now, but: these positive tests
for is_constant surprised me.  Do we really only want to do this
for variable-length SVE code generation, rather than fixed-length?

> +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;

This should happen after the later tests, once we're sure that the
permute vector has the right form.  If the issue is that op0 isn't
provided for testing then I think the hook needs to be passed the
input mode alongside the result mode.

It might then be better to test:

  aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD

(despite what I said earlier, about testing is_constant, sorry).

> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> +    return false;
> +
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        else if (aarch64_evpc_reencode (d))
>  	return true;
>        if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> +	{
> +	  if (aarch64_evpc_sve_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_sve_tbl (d))
> +	    return true;
> +	}
>        else if (d->vec_flags == VEC_ADVSIMD)
>  	return aarch64_evpc_tbl (d);
>      }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> new file mode 100644
> index 00000000000..35100a9e01c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +svint32_t f1 (int32x4_t x)
> +{
> +  return svld1rq (svptrue_b8 (), &x[0]);
> +}
> +
> +svint32_t f2 (int *x)
> +{
> +  return svld1rq (svptrue_b8 (), x);
> +}
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
Prathamesh Kulkarni May 11, 2022, 6:24 a.m. UTC | #5
On Fri, 6 May 2022 at 16:00, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index c24c0548724..1ef4ea2087b 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,14 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "aarch64-builtins.h"
> > +#include "gimple-ssa.h"
> > +#include "tree-phinodes.h"
> > +#include "tree-ssa-operands.h"
> > +#include "ssa-iterators.h"
> > +#include "stringpool.h"
> > +#include "value-range.h"
> > +#include "tree-ssanames.h"
>
> Minor, but: I think the preferred approach is to include "ssa.h"
> rather than include some of these headers directly.
>
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1215,56 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>
> Does this work for other element sizes?  I would have expected it
> to be the (128-bit) Advanced SIMD vector associated with the same
> element type as the SVE vector.
>
> The testcase should cover more than just int32x4_t -> svint32_t,
> just to be sure.
In the attached patch, it obtains corresponding advsimd type with:

tree eltype = TREE_TYPE (lhs_type);
unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
tree vectype = build_vector_type (eltype, nunits);

While this seems to work with different element sizes, I am not sure if it's
the correct approach?
>
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> > +
> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>
> Long line.  Might be easier to format by assigning the fold_build2 result
> to a temporary variable.
>
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> > +     tree lhs_type = TREE_TYPE (lhs);
>
> Is this necessary?  The code above supplied the types and I wouldn't
> have expected them to change during the build process.
>
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>
> Nit: long line.
>
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f650abbc4ce..47810fec804 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || d->perm.length ().is_constant ()
> > +      || !d->one_vector_p
> > +      || d->target == NULL
> > +      || d->op0 == NULL
>
> These last two lines mean that we always return false for d->testing.
> The idea instead is that the return value should be the same for both
> d->testing and !d->testing.  The difference is that for !d->testing we
> also emit code to do the permute.
>
> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>
> Sorry, I've forgotten the context now, but: these positive tests
> for is_constant surprised me.  Do we really only want to do this
> for variable-length SVE code generation, rather than fixed-length?
>
> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> > +    return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
>
> This should happen after the later tests, once we're sure that the
> permute vector has the right form.  If the issue is that op0 isn't
> provided for testing then I think the hook needs to be passed the
> input mode alongside the result mode.
>
> It might then be better to test:
>
>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
>
> (despite what I said earlier, about testing is_constant, sorry).
Thanks for the suggestions; I tried to address them in the attached patch.
Does it look OK after bootstrap+test?

The patch seems to generate the same code for different vector types.
For eg:

svint32_t foo (int32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

svint16_t foo2(int16x8_t x)
{
  return svld1rq_s16 (svptrue_b8 (), &x[0]);
}

.optimized dump:
;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
cgraph_uid=4351, symbol_order=4350)
svint32_t foo (int32x4_t x)
{
  svint32_t _2;

  <bb 2> [local count: 1073741824]:
  _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
  return _2;

}

;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
cgraph_uid=4352, symbol_order=4351)

svint16_t foo2 (int16x8_t x)
{
  svint16_t _2;

  <bb 2> [local count: 1073741824]:
  _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
  return _2;

}

resulting in code-gen:
foo:
        dup     z0.q, z0.q[0]
        ret

foo2:
        dup     z0.q, z0.q[0]
        ret

I suppose this is correct, since in both cases it's replicating the
entire 128-bit vector (irrespective of element sizes)?

Thanks,
Prathamesh
>
> > +
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> > +    return false;
> > +
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >        else if (aarch64_evpc_reencode (d))
> >       return true;
> >        if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > +     {
> > +       if (aarch64_evpc_sve_dup (d))
> > +         return true;
> > +       else if (aarch64_evpc_sve_tbl (d))
> > +         return true;
> > +     }
> >        else if (d->vec_flags == VEC_ADVSIMD)
> >       return aarch64_evpc_tbl (d);
> >      }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> > new file mode 100644
> > index 00000000000..35100a9e01c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +svint32_t f1 (int32x4_t x)
> > +{
> > +  return svld1rq (svptrue_b8 (), &x[0]);
> > +}
> > +
> > +svint32_t f2 (int *x)
> > +{
> > +  return svld1rq (svptrue_b8 (), x);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c0548724..8a2e5b886e4 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
 #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
 
 using namespace aarch64_sve;
 
@@ -1207,6 +1208,59 @@ public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0))
+      {
+	tree lhs = gimple_call_lhs (f.call);
+	tree lhs_type = TREE_TYPE (lhs);
+	tree eltype = TREE_TYPE (lhs_type);
+	unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
+	tree vectype = build_vector_type (eltype, nunits);
+
+	tree elt_ptr_type
+	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
+	tree zero = build_zero_cst (elt_ptr_type);
+
+	/* Use element type alignment.  */
+	tree access_type
+	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+	gimple *mem_ref_stmt
+	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+	for (int i = 0; i < source_nelts; i++)
+	  sel.quick_push (i);
+
+	vec_perm_indices indices (sel, 1, source_nelts);
+	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+						   indices));
+	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+	return gimple_build_assign (lhs, VEC_PERM_EXPR,
+				    mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce..072ec9bd153 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || d->perm.length ().is_constant ()
+      || !d->one_vector_p
+      || d->target == NULL
+      || d->op0 == NULL
+      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0
+      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
+    return false;
+
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
 /* Try to implement D using SVE SEL instruction.  */
 
 static bool
@@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
-	return aarch64_evpc_sve_tbl (d);
+	{
+	  if (aarch64_evpc_sve_dup (d))
+	    return true;
+	  else if (aarch64_evpc_sve_tbl (d))
+	    return true;
+	}
       else if (d->vec_flags == VEC_ADVSIMD)
 	return aarch64_evpc_tbl (d);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
new file mode 100644
index 00000000000..5af3b6ed24c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
new file mode 100644
index 00000000000..17e78c57c1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
Richard Sandiford May 11, 2022, 7:14 a.m. UTC | #6
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Fri, 6 May 2022 at 16:00, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index c24c0548724..1ef4ea2087b 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -44,6 +44,14 @@
>> >  #include "aarch64-sve-builtins-shapes.h"
>> >  #include "aarch64-sve-builtins-base.h"
>> >  #include "aarch64-sve-builtins-functions.h"
>> > +#include "aarch64-builtins.h"
>> > +#include "gimple-ssa.h"
>> > +#include "tree-phinodes.h"
>> > +#include "tree-ssa-operands.h"
>> > +#include "ssa-iterators.h"
>> > +#include "stringpool.h"
>> > +#include "value-range.h"
>> > +#include "tree-ssanames.h"
>>
>> Minor, but: I think the preferred approach is to include "ssa.h"
>> rather than include some of these headers directly.
>>
>> >
>> >  using namespace aarch64_sve;
>> >
>> > @@ -1207,6 +1215,56 @@ public:
>> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >      return e.use_contiguous_load_insn (icode);
>> >    }
>> > +
>> > +  gimple *
>> > +  fold (gimple_folder &f) const OVERRIDE
>> > +  {
>> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> > +
>> > +    /* Transform:
>> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> > +       into:
>> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
>> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> > +       on little endian target.  */
>> > +
>> > +    if (!BYTES_BIG_ENDIAN
>> > +     && integer_all_onesp (arg0))
>> > +      {
>> > +     tree lhs = gimple_call_lhs (f.call);
>> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>>
>> Does this work for other element sizes?  I would have expected it
>> to be the (128-bit) Advanced SIMD vector associated with the same
>> element type as the SVE vector.
>>
>> The testcase should cover more than just int32x4_t -> svint32_t,
>> just to be sure.
> In the attached patch, it obtains corresponding advsimd type with:
>
> tree eltype = TREE_TYPE (lhs_type);
> unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> tree vectype = build_vector_type (eltype, nunits);
>
> While this seems to work with different element sizes, I am not sure if it's
> the correct approach ?

Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
to get the vector mode associated with a .Q “element”, so an
alternative would be:

    machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
    tree vectype = build_vector_type_for_mode (eltype, vq_mode);

which is more explicit about wanting an Advanced SIMD vector.

>> > +
>> > +     tree elt_ptr_type
>> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> > +     tree zero = build_zero_cst (elt_ptr_type);
>> > +
>> > +     /* Use element type alignment.  */
>> > +     tree access_type
>> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
>> > +
>> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> > +     gimple *mem_ref_stmt
>> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>>
>> Long line.  Might be easier to format by assigning the fold_build2 result
>> to a temporary variable.
>>
>> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> > +
>> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
>> > +     tree lhs_type = TREE_TYPE (lhs);
>>
>> Is this necessary?  The code above supplied the types and I wouldn't
>> have expected them to change during the build process.
>>
>> > +
>> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
>> > +     for (int i = 0; i < source_nelts; i++)
>> > +       sel.quick_push (i);
>> > +
>> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
>> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>>
>> Nit: long line.
>>
>> > +      }
>> > +
>> > +    return NULL;
>> > +  }
>> >  };
>> >
>> >  class svld1ro_impl : public load_replicate
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index f650abbc4ce..47810fec804 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >    return true;
>> >  }
>> >
>> > +/* Try to implement D using SVE dup instruction.  */
>> > +
>> > +static bool
>> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> > +{
>> > +  if (BYTES_BIG_ENDIAN
>> > +      || d->perm.length ().is_constant ()
>> > +      || !d->one_vector_p
>> > +      || d->target == NULL
>> > +      || d->op0 == NULL
>>
>> These last two lines mean that we always return false for d->testing.
>> The idea instead is that the return value should be the same for both
>> d->testing and !d->testing.  The difference is that for !d->testing we
>> also emit code to do the permute.

It doesn't look like the new patch addresses this.  There should be
no checks for/uses of “d->target” and “d->op0” until after:

  if (d->testing_p)
    return true;

This...  

>> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>>
>> Sorry, I've forgotten the context now, but: these positive tests
>> for is_constant surprised me.  Do we really only want to do this
>> for variable-length SVE code generation, rather than fixed-length?
>>
>> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
>> > +    return false;
>> > +
>> > +  if (d->testing_p)
>> > +    return true;
>>
>> This should happen after the later tests, once we're sure that the
>> permute vector has the right form.  If the issue is that op0 isn't
>> provided for testing then I think the hook needs to be passed the
>> input mode alongside the result mode.

...was my guess about why the checks were there.

>> It might then be better to test:
>>
>>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
>>
>> (despite what I said earlier, about testing is_constant, sorry).
> Thanks for the suggestions, I tried to address them in the attached patch.
> Does it look OK after bootstrap+test ?
>
> The patch seems to generate the same code for different vector types.
> For eg:
>
> svint32_t foo (int32x4_t x)
> {
>   return svld1rq (svptrue_b8 (), &x[0]);
> }
>
> svint16_t foo2(int16x8_t x)
> {
>   return svld1rq_s16 (svptrue_b8 (), &x[0]);
> }
>
> .optimized dump:
> ;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
> cgraph_uid=4351, symbol_order=4350)
> svint32_t foo (int32x4_t x)
> {
>   svint32_t _2;
>
>   <bb 2> [local count: 1073741824]:
>   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
>   return _2;
>
> }
>
> ;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
> cgraph_uid=4352, symbol_order=4351)
>
> svint16_t foo2 (int16x8_t x)
> {
>   svint16_t _2;
>
>   <bb 2> [local count: 1073741824]:
>   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
>   return _2;
>
> }
>
> resulting in code-gen:
> foo:
>         dup     z0.q, z0.q[0]
>         ret
>
> foo2:
>         dup     z0.q, z0.q[0]
>         ret
>
> I suppose this is correct, since in both cases it's replicating the
> entire 128-bit vector (irrespective of element sizes) ?

Yeah, the output code will be the same for all cases.

> Thanks,
> Prathamesh
>>
>> > +
>> > +  int npatterns = d->perm.encoding ().npatterns ();
>> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
>> > +    return false;
>> > +
>> > +  for (int i = 0; i < npatterns; i++)
>> > +    if (!known_eq (d->perm[i], i))
>> > +      return false;
>> > +
>> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
>> > +  return true;
>> > +}
>> > +
>> >  /* Try to implement D using SVE SEL instruction.  */
>> >
>> >  static bool
>> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >        else if (aarch64_evpc_reencode (d))
>> >       return true;
>> >        if (d->vec_flags == VEC_SVE_DATA)
>> > -     return aarch64_evpc_sve_tbl (d);
>> > +     {
>> > +       if (aarch64_evpc_sve_dup (d))
>> > +         return true;
>> > +       else if (aarch64_evpc_sve_tbl (d))
>> > +         return true;
>> > +     }
>> >        else if (d->vec_flags == VEC_ADVSIMD)
>> >       return aarch64_evpc_tbl (d);
>> >      }
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
>> > new file mode 100644
>> > index 00000000000..35100a9e01c
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
>> > @@ -0,0 +1,17 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +
>> > +#include "arm_neon.h"
>> > +#include "arm_sve.h"
>> > +
>> > +svint32_t f1 (int32x4_t x)
>> > +{
>> > +  return svld1rq (svptrue_b8 (), &x[0]);
>> > +}
>> > +
>> > +svint32_t f2 (int *x)
>> > +{
>> > +  return svld1rq (svptrue_b8 (), x);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c0548724..8a2e5b886e4 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1208,59 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	tree lhs_type = TREE_TYPE (lhs);
> +	tree eltype = TREE_TYPE (lhs_type);
> +	unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> +	tree vectype = build_vector_type (eltype, nunits);
> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +	vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +						   indices));
> +	tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +				    mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce..072ec9bd153 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || d->perm.length ().is_constant ()
> +      || !d->one_vector_p
> +      || d->target == NULL
> +      || d->op0 == NULL
> +      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0

This check isn't necessary, since the caller has already checked that
this is an SVE permute.

> +      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
> +    return false;
> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> +    return false;
> +
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>  
>  static bool
> @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        else if (aarch64_evpc_reencode (d))
>  	return true;
>        if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> +	{
> +	  if (aarch64_evpc_sve_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_sve_tbl (d))
> +	    return true;
> +	}
>        else if (d->vec_flags == VEC_ADVSIMD)
>  	return aarch64_evpc_tbl (d);
>      }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> new file mode 100644
> index 00000000000..5af3b6ed24c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +#define TEST(ret_type, param_type, suffix) \
> +ret_type test_##suffix(param_type x) \
> +{ \
> +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> +}
> +
> +TEST(svint8_t, int8x16_t, s8)
> +TEST(svint16_t, int16x8_t, s16)
> +TEST(svint32_t, int32x4_t, s32)
> +TEST(svint64_t, int64x2_t, s64)
> +
> +TEST(svuint8_t, uint8x16_t, u8)
> +TEST(svuint16_t, uint16x8_t, u16)
> +TEST(svuint32_t, uint32x4_t, u32)
> +TEST(svuint64_t, uint64x2_t, u64)
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> new file mode 100644
> index 00000000000..17e78c57c1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include "arm_neon.h"
> +#include "arm_sve.h"
> +
> +#define TEST(ret_type, param_type, suffix) \
> +ret_type test_##suffix(param_type *x) \
> +{ \
> +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> +}
> +
> +TEST(svint8_t, int8_t, s8)
> +TEST(svint16_t, int16_t, s16)
> +TEST(svint32_t, int32_t, s32)
> +TEST(svint64_t, int64_t, s64)
> +
> +TEST(svuint8_t, uint8_t, u8)
> +TEST(svuint16_t, uint16_t, u16)
> +TEST(svuint32_t, uint32_t, u32)
> +TEST(svuint64_t, uint64_t, u64)
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */

It would be good to check the float modes too.

Thanks,
Richard
Prathamesh Kulkarni May 12, 2022, 9:12 a.m. UTC | #7
On Wed, 11 May 2022 at 12:44, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > index c24c0548724..1ef4ea2087b 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > @@ -44,6 +44,14 @@
> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >  #include "aarch64-sve-builtins-base.h"
> >> >  #include "aarch64-sve-builtins-functions.h"
> >> > +#include "aarch64-builtins.h"
> >> > +#include "gimple-ssa.h"
> >> > +#include "tree-phinodes.h"
> >> > +#include "tree-ssa-operands.h"
> >> > +#include "ssa-iterators.h"
> >> > +#include "stringpool.h"
> >> > +#include "value-range.h"
> >> > +#include "tree-ssanames.h"
> >>
> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> rather than include some of these headers directly.
> >>
> >> >
> >> >  using namespace aarch64_sve;
> >> >
> >> > @@ -1207,6 +1215,56 @@ public:
> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >      return e.use_contiguous_load_insn (icode);
> >> >    }
> >> > +
> >> > +  gimple *
> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> > +  {
> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
> >> > +
> >> > +    /* Transform:
> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> >> > +       into:
> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> >> > +       on little endian target.  */
> >> > +
> >> > +    if (!BYTES_BIG_ENDIAN
> >> > +     && integer_all_onesp (arg0))
> >> > +      {
> >> > +     tree lhs = gimple_call_lhs (f.call);
> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >>
> >> Does this work for other element sizes?  I would have expected it
> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> element type as the SVE vector.
> >>
> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> just to be sure.
> > In the attached patch, it obtains corresponding advsimd type with:
> >
> > tree eltype = TREE_TYPE (lhs_type);
> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> > tree vectype = build_vector_type (eltype, nunits);
> >
> > While this seems to work with different element sizes, I am not sure if it's
> > the correct approach ?
>
> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> to get the vector mode associated with a .Q “element”, so an
> alternative would be:
>
>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>
> which is more explicit about wanting an Advanced SIMD vector.
>
> >> > +
> >> > +     tree elt_ptr_type
> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> >> > +     tree zero = build_zero_cst (elt_ptr_type);
> >> > +
> >> > +     /* Use element type alignment.  */
> >> > +     tree access_type
> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> >> > +
> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> > +     gimple *mem_ref_stmt
> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
> >>
> >> Long line.  Might be easier to format by assigning the fold_build2 result
> >> to a temporary variable.
> >>
> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> > +
> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
> >> > +     tree lhs_type = TREE_TYPE (lhs);
> >>
> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> have expected them to change during the build process.
> >>
> >> > +
> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> >> > +     for (int i = 0; i < source_nelts; i++)
> >> > +       sel.quick_push (i);
> >> > +
> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
> >>
> >> Nit: long line.
> >>
> >> > +      }
> >> > +
> >> > +    return NULL;
> >> > +  }
> >> >  };
> >> >
> >> >  class svld1ro_impl : public load_replicate
> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> > index f650abbc4ce..47810fec804 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >> >    return true;
> >> >  }
> >> >
> >> > +/* Try to implement D using SVE dup instruction.  */
> >> > +
> >> > +static bool
> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> >> > +{
> >> > +  if (BYTES_BIG_ENDIAN
> >> > +      || d->perm.length ().is_constant ()
> >> > +      || !d->one_vector_p
> >> > +      || d->target == NULL
> >> > +      || d->op0 == NULL
> >>
> >> These last two lines mean that we always return false for d->testing.
> >> The idea instead is that the return value should be the same for both
> >> d->testing and !d->testing.  The difference is that for !d->testing we
> >> also emit code to do the permute.
>
> It doesn't look like the new patch addresses this.  There should be
> no checks for/uses of “d->target” and “d->op0” until after:
>
>   if (d->testing_p)
>     return true;
>
> This...
>
> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
> >>
> >> Sorry, I've forgotten the context now, but: these positive tests
> >> for is_constant surprised me.  Do we really only want to do this
> >> for variable-length SVE code generation, rather than fixed-length?
> >>
> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> >> > +    return false;
> >> > +
> >> > +  if (d->testing_p)
> >> > +    return true;
> >>
> >> This should happen after the later tests, once we're sure that the
> >> permute vector has the right form.  If the issue is that op0 isn't
> >> provided for testing then I think the hook needs to be passed the
> >> input mode alongside the result mode.
>
> ...was my guess about why the checks were there.
Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
In that case, how do we obtain input mode ?

Thanks,
Prathamesh

>
> >> It might then be better to test:
> >>
> >>   aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD
> >>
> >> (despite what I said earlier, about testing is_constant, sorry).
> > Thanks for the suggestions, I tried to address them in the attached patch.
> > Does it look OK after bootstrap+test ?
> >
> > The patch seems to generate the same code for different vector types.
> > For eg:
> >
> > svint32_t foo (int32x4_t x)
> > {
> >   return svld1rq (svptrue_b8 (), &x[0]);
> > }
> >
> > svint16_t foo2(int16x8_t x)
> > {
> >   return svld1rq_s16 (svptrue_b8 (), &x[0]);
> > }
> >
> > .optimized dump:
> > ;; Function foo (foo, funcdef_no=4350, decl_uid=29928,
> > cgraph_uid=4351, symbol_order=4350)
> > svint32_t foo (int32x4_t x)
> > {
> >   svint32_t _2;
> >
> >   <bb 2> [local count: 1073741824]:
> >   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, ... }>;
> >   return _2;
> >
> > }
> >
> > ;; Function foo2 (foo2, funcdef_no=4351, decl_uid=29931,
> > cgraph_uid=4352, symbol_order=4351)
> >
> > svint16_t foo2 (int16x8_t x)
> > {
> >   svint16_t _2;
> >
> >   <bb 2> [local count: 1073741824]:
> >   _2 = VEC_PERM_EXPR <x_3(D), x_3(D), { 0, 1, 2, 3, 4, 5, 6, 7, ... }>;
> >   return _2;
> >
> > }
> >
> > resulting in code-gen:
> > foo:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > foo2:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > I suppose this is correct, since in both cases it's replicating the
> > entire 128-bit vector (irrespective of element sizes) ?
>
> Yeah, the output code will be the same for all cases.
>
> > Thanks,
> > Prathamesh
> >>
> >> > +
> >> > +  int npatterns = d->perm.encoding ().npatterns ();
> >> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> >> > +    return false;
> >> > +
> >> > +  for (int i = 0; i < npatterns; i++)
> >> > +    if (!known_eq (d->perm[i], i))
> >> > +      return false;
> >> > +
> >> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> >> > +  return true;
> >> > +}
> >> > +
> >> >  /* Try to implement D using SVE SEL instruction.  */
> >> >
> >> >  static bool
> >> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >> >        else if (aarch64_evpc_reencode (d))
> >> >       return true;
> >> >        if (d->vec_flags == VEC_SVE_DATA)
> >> > -     return aarch64_evpc_sve_tbl (d);
> >> > +     {
> >> > +       if (aarch64_evpc_sve_dup (d))
> >> > +         return true;
> >> > +       else if (aarch64_evpc_sve_tbl (d))
> >> > +         return true;
> >> > +     }
> >> >        else if (d->vec_flags == VEC_ADVSIMD)
> >> >       return aarch64_evpc_tbl (d);
> >> >      }
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> >> > new file mode 100644
> >> > index 00000000000..35100a9e01c
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c
> >> > @@ -0,0 +1,17 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +
> >> > +#include "arm_neon.h"
> >> > +#include "arm_sve.h"
> >> > +
> >> > +svint32_t f1 (int32x4_t x)
> >> > +{
> >> > +  return svld1rq (svptrue_b8 (), &x[0]);
> >> > +}
> >> > +
> >> > +svint32_t f2 (int *x)
> >> > +{
> >> > +  return svld1rq (svptrue_b8 (), x);
> >> > +}
> >> > +
> >> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index c24c0548724..8a2e5b886e4 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,7 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "ssa.h"
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1208,59 @@ public:
> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >      return e.use_contiguous_load_insn (icode);
> >    }
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +    tree arg0 = gimple_call_arg (f.call, 0);
> > +    tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +    /* Transform:
> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
> > +       into:
> > +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> > +       on little endian target.
> > +       vectype is the corresponding ADVSIMD type.  */
> > +
> > +    if (!BYTES_BIG_ENDIAN
> > +     && integer_all_onesp (arg0))
> > +      {
> > +     tree lhs = gimple_call_lhs (f.call);
> > +     tree lhs_type = TREE_TYPE (lhs);
> > +     tree eltype = TREE_TYPE (lhs_type);
> > +     unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> > +     tree vectype = build_vector_type (eltype, nunits);
> > +
> > +     tree elt_ptr_type
> > +       = build_pointer_type_for_mode (eltype, VOIDmode, true);
> > +     tree zero = build_zero_cst (elt_ptr_type);
> > +
> > +     /* Use element type alignment.  */
> > +     tree access_type
> > +       = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> > +
> > +     tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> > +     tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> > +     gimple *mem_ref_stmt
> > +       = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> > +     for (int i = 0; i < source_nelts; i++)
> > +       sel.quick_push (i);
> > +
> > +     vec_perm_indices indices (sel, 1, source_nelts);
> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > +                                                indices));
> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR,
> > +                                 mem_ref_lhs, mem_ref_lhs, mask);
> > +      }
> > +
> > +    return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f650abbc4ce..072ec9bd153 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >    return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +      || d->perm.length ().is_constant ()
> > +      || !d->one_vector_p
> > +      || d->target == NULL
> > +      || d->op0 == NULL
> > +      || (aarch64_classify_vector_mode (GET_MODE (d->target)) & VEC_ANY_SVE) == 0
>
> This check isn't necessary, since the caller has already checked that
> this is an SVE permute.
>
> > +      || (aarch64_classify_vector_mode (GET_MODE (d->op0)) & VEC_ADVSIMD) == 0)
> > +    return false;
> > +
> > +  int npatterns = d->perm.encoding ().npatterns ();
> > +  if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0))))
> > +    return false;
> > +
> > +  for (int i = 0; i < npatterns; i++)
> > +    if (!known_eq (d->perm[i], i))
> > +      return false;
> > +
> > +  if (d->testing_p)
> > +    return true;
> > +
> > +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > +  return true;
> > +}
> > +
> >  /* Try to implement D using SVE SEL instruction.  */
> >
> >  static bool
> > @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> >        else if (aarch64_evpc_reencode (d))
> >       return true;
> >        if (d->vec_flags == VEC_SVE_DATA)
> > -     return aarch64_evpc_sve_tbl (d);
> > +     {
> > +       if (aarch64_evpc_sve_dup (d))
> > +         return true;
> > +       else if (aarch64_evpc_sve_tbl (d))
> > +         return true;
> > +     }
> >        else if (d->vec_flags == VEC_ADVSIMD)
> >       return aarch64_evpc_tbl (d);
> >      }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> > new file mode 100644
> > index 00000000000..5af3b6ed24c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +#define TEST(ret_type, param_type, suffix) \
> > +ret_type test_##suffix(param_type x) \
> > +{ \
> > +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> > +}
> > +
> > +TEST(svint8_t, int8x16_t, s8)
> > +TEST(svint16_t, int16x8_t, s16)
> > +TEST(svint32_t, int32x4_t, s32)
> > +TEST(svint64_t, int64x2_t, s64)
> > +
> > +TEST(svuint8_t, uint8x16_t, u8)
> > +TEST(svuint16_t, uint16x8_t, u16)
> > +TEST(svuint32_t, uint32x4_t, u32)
> > +TEST(svuint64_t, uint64x2_t, u64)
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> > new file mode 100644
> > index 00000000000..17e78c57c1b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
> > @@ -0,0 +1,23 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include "arm_neon.h"
> > +#include "arm_sve.h"
> > +
> > +#define TEST(ret_type, param_type, suffix) \
> > +ret_type test_##suffix(param_type *x) \
> > +{ \
> > +  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
> > +}
> > +
> > +TEST(svint8_t, int8_t, s8)
> > +TEST(svint16_t, int16_t, s16)
> > +TEST(svint32_t, int32_t, s32)
> > +TEST(svint64_t, int64_t, s64)
> > +
> > +TEST(svuint8_t, uint8_t, u8)
> > +TEST(svuint16_t, uint16_t, u16)
> > +TEST(svuint32_t, uint32_t, u32)
> > +TEST(svuint64_t, uint64_t, u64)
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 8 { target aarch64_little_endian } } } */
>
> It would be good to check the float modes too.
>
> Thanks,
> Richard
Richard Sandiford May 12, 2022, 10:44 a.m. UTC | #8
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Wed, 11 May 2022 at 12:44, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Fri, 6 May 2022 at 16:00, Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > index c24c0548724..1ef4ea2087b 100644
>> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > @@ -44,6 +44,14 @@
>> >> >  #include "aarch64-sve-builtins-shapes.h"
>> >> >  #include "aarch64-sve-builtins-base.h"
>> >> >  #include "aarch64-sve-builtins-functions.h"
>> >> > +#include "aarch64-builtins.h"
>> >> > +#include "gimple-ssa.h"
>> >> > +#include "tree-phinodes.h"
>> >> > +#include "tree-ssa-operands.h"
>> >> > +#include "ssa-iterators.h"
>> >> > +#include "stringpool.h"
>> >> > +#include "value-range.h"
>> >> > +#include "tree-ssanames.h"
>> >>
>> >> Minor, but: I think the preferred approach is to include "ssa.h"
>> >> rather than include some of these headers directly.
>> >>
>> >> >
>> >> >  using namespace aarch64_sve;
>> >> >
>> >> > @@ -1207,6 +1215,56 @@ public:
>> >> >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >> >      return e.use_contiguous_load_insn (icode);
>> >> >    }
>> >> > +
>> >> > +  gimple *
>> >> > +  fold (gimple_folder &f) const OVERRIDE
>> >> > +  {
>> >> > +    tree arg0 = gimple_call_arg (f.call, 0);
>> >> > +    tree arg1 = gimple_call_arg (f.call, 1);
>> >> > +
>> >> > +    /* Transform:
>> >> > +       lhs = svld1rq ({-1, -1, ... }, arg1)
>> >> > +       into:
>> >> > +       tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1]
>> >> > +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
>> >> > +       on little endian target.  */
>> >> > +
>> >> > +    if (!BYTES_BIG_ENDIAN
>> >> > +     && integer_all_onesp (arg0))
>> >> > +      {
>> >> > +     tree lhs = gimple_call_lhs (f.call);
>> >> > +     auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>> >>
>> >> Does this work for other element sizes?  I would have expected it
>> >> to be the (128-bit) Advanced SIMD vector associated with the same
>> >> element type as the SVE vector.
>> >>
>> >> The testcase should cover more than just int32x4_t -> svint32_t,
>> >> just to be sure.
>> > In the attached patch, it obtains corresponding advsimd type with:
>> >
>> > tree eltype = TREE_TYPE (lhs_type);
>> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
>> > tree vectype = build_vector_type (eltype, nunits);
>> >
>> > While this seems to work with different element sizes, I am not sure if it's
>> > the correct approach ?
>>
>> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
>> to get the vector mode associated with a .Q “element”, so an
>> alternative would be:
>>
>>     machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>>     tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>>
>> which is more explicit about wanting an Advanced SIMD vector.
>>
>> >> > +
>> >> > +     tree elt_ptr_type
>> >> > +       = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> >> > +     tree zero = build_zero_cst (elt_ptr_type);
>> >> > +
>> >> > +     /* Use element type alignment.  */
>> >> > +     tree access_type
>> >> > +       = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
>> >> > +
>> >> > +     tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> >> > +     gimple *mem_ref_stmt
>> >> > +       = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero));
>> >>
>> >> Long line.  Might be easier to format by assigning the fold_build2 result
>> >> to a temporary variable.
>> >>
>> >> > +     gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> >> > +
>> >> > +     tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> >> > +     tree vectype = TREE_TYPE (mem_ref_lhs);
>> >> > +     tree lhs_type = TREE_TYPE (lhs);
>> >>
>> >> Is this necessary?  The code above supplied the types and I wouldn't
>> >> have expected them to change during the build process.
>> >>
>> >> > +
>> >> > +     int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> >> > +     vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
>> >> > +     for (int i = 0; i < source_nelts; i++)
>> >> > +       sel.quick_push (i);
>> >> > +
>> >> > +     vec_perm_indices indices (sel, 1, source_nelts);
>> >> > +     gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices));
>> >> > +     tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> >> > +     return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask);
>> >>
>> >> Nit: long line.
>> >>
>> >> > +      }
>> >> > +
>> >> > +    return NULL;
>> >> > +  }
>> >> >  };
>> >> >
>> >> >  class svld1ro_impl : public load_replicate
>> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> >> > index f650abbc4ce..47810fec804 100644
>> >> > --- a/gcc/config/aarch64/aarch64.cc
>> >> > +++ b/gcc/config/aarch64/aarch64.cc
>> >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >> >    return true;
>> >> >  }
>> >> >
>> >> > +/* Try to implement D using SVE dup instruction.  */
>> >> > +
>> >> > +static bool
>> >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> >> > +{
>> >> > +  if (BYTES_BIG_ENDIAN
>> >> > +      || d->perm.length ().is_constant ()
>> >> > +      || !d->one_vector_p
>> >> > +      || d->target == NULL
>> >> > +      || d->op0 == NULL
>> >>
>> >> These last two lines mean that we always return false for d->testing.
>> >> The idea instead is that the return value should be the same for both
>> >> d->testing and !d->testing.  The difference is that for !d->testing we
>> >> also emit code to do the permute.
>>
>> It doesn't look like the new patch addresses this.  There should be
>> no checks for/uses of “d->target” and “d->op0” until after:
>>
>>   if (d->testing_p)
>>     return true;
>>
>> This...
>>
>> >> > +      || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>> >>
>> >> Sorry, I've forgotten the context now, but: these positive tests
>> >> for is_constant surprised me.  Do we really only want to do this
>> >> for variable-length SVE code generation, rather than fixed-length?
>> >>
>> >> > +      || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
>> >> > +    return false;
>> >> > +
>> >> > +  if (d->testing_p)
>> >> > +    return true;
>> >>
>> >> This should happen after the later tests, once we're sure that the
>> >> permute vector has the right form.  If the issue is that op0 isn't
>> >> provided for testing then I think the hook needs to be passed the
>> >> input mode alongside the result mode.
>>
>> ...was my guess about why the checks were there.
> Ah right sorry. IIUC, if d->testing is true, then d->op0 could be NULL ?
> In that case, how do we obtain input mode ?

Well, like I say, I think we might need to extend the vec_perm_const
hook interface so that it gets passed the input mode, now that that
isn't necessarily the same as the output mode.

It would be good to do that as a separate prepatch, since it would
affect other targets too.  And for safety, that patch should make all
existing implementations of the hook return false if the modes aren't
equal, including for aarch64.  The current patch can then make the
aarch64 hook treat the dup case as an exception.

Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 02e42a71e5e..e21bbec360c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1207,6 +1207,56 @@  public:
     insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
     return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, &v[0])
+       into:
+       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>
+       on little-endian targets.  */
+
+    if (!BYTES_BIG_ENDIAN
+	&& integer_all_onesp (arg0)
+	&& TREE_CODE (arg1) == ADDR_EXPR)
+      {
+	tree t = TREE_OPERAND (arg1, 0);
+	if (TREE_CODE (t) == ARRAY_REF)
+	  {
+	    tree index = TREE_OPERAND (t, 1);
+	    t = TREE_OPERAND (t, 0);
+	    if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
+	      {
+		t = TREE_OPERAND (t, 0);
+		tree vectype = TREE_TYPE (t);
+		if (VECTOR_TYPE_P (vectype)
+		    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+		    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
+		  {
+		    tree lhs = gimple_call_lhs (f.call);
+		    tree lhs_type = TREE_TYPE (lhs);
+		    int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+		    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+		    for (int i = 0; i < source_nelts; i++)
+		      sel.quick_push (i);
+
+		    vec_perm_indices indices (sel, 1, source_nelts);
+		    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
+		      return NULL;
+
+		    tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+		    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
+		  }
+	      }
+	  }
+      }
+
+    return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f07330cff4f..af27f550be3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23002,8 +23002,32 @@  aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
 
   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
+
   if (d->one_vector_p)
-    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+    {
+      bool use_dupq = false;
+      /* Check if sel is a dup vector with encoded elements {0, 1, ..., nelts - 1}.  */
+      if (GET_CODE (sel) == CONST_VECTOR
+	  && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
+	  && CONST_VECTOR_DUPLICATE_P (sel))
+	  {
+	    unsigned nelts = const_vector_encoded_nelts (sel);
+	    unsigned i;
+	    for (i = 0; i < nelts; i++)
+	      {
+		rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
+		if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
+		  break;
+	      }
+	    if (i == nelts)
+	      use_dupq = true;
+	  }
+
+      if (use_dupq)
+	aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+      else
+	emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+    }
   else
     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
   return true;