[3/3] aarch64: Add SVE support for simd clones [PR 96342]

Message ID 20240130143132.9575-4-andre.simoesdiasvieira@arm.com
State New
Headers
Series vect, aarch64: Add SVE support for simdclones |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Testing failed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Testing failed

Commit Message

Andre Vieira (lists) Jan. 30, 2024, 2:31 p.m. UTC
  This patch finalizes adding support for the generation of SVE simd clones when
no simdlen is provided, following the ABI rules where the widest data type
determines the minimum amount of elements in a length agnostic vector.

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare.
	* config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
	visibility global and support use for non_acle types.
	* config/aarch64/aarch64.cc
	(aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
	when no simdlen is provided, according to ABI rules.
	(simd_clone_adjust_sve_vector_type): New helper function.
	(aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones
	and modify types to use SVE types.
	* omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
	(simd_clone_adjust): Adapt safelen check to be compatible with VLA
	simdlen.

gcc/testsuite/ChangeLog:

	* c-c++-common/gomp/declare-variant-14.c: Make i?86 and x86_64 target
	only test.
	* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
	* gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
	* gcc.target/aarch64/vect-simd-clone-1.c: New test.
  

Comments

Richard Sandiford Feb. 1, 2024, 9:59 p.m. UTC | #1
Andre Vieira <andre.simoesdiasvieira@arm.com> writes:
> This patch finalizes adding support for the generation of SVE simd clones when
> no simdlen is provided, following the ABI rules where the widest data type
> determines the minimum amount of elements in a length agnostic vector.
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare.
> 	* config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
> 	visibility global and support use for non_acle types.
> 	* config/aarch64/aarch64.cc
> 	(aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
> 	when no simdlen is provided, according to ABI rules.
> 	(simd_clone_adjust_sve_vector_type): New helper function.
> 	(aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones
> 	and modify types to use SVE types.
> 	* omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
> 	(simd_clone_adjust): Adapt safelen check to be compatible with VLA
> 	simdlen.
>
> gcc/testsuite/ChangeLog:
>
> 	* c-c++-common/gomp/declare-variant-14.c: Make i?86 and x86_64 target
> 	only test.
> 	* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
> 	* gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
> 	* gcc.target/aarch64/vect-simd-clone-1.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index a0b142e0b94..207396de0ff 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1031,6 +1031,8 @@ namespace aarch64_sve {
>  #ifdef GCC_TARGET_H
>    bool verify_type_context (location_t, type_context_kind, const_tree, bool);
>  #endif
> + void add_sve_type_attribute (tree, unsigned int, unsigned int,
> +			      const char *, const char *);
>  }
>  
>  extern void aarch64_split_combinev16qi (rtx operands[3]);
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 11f5c5c500c..747131e684e 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -953,14 +953,16 @@ static bool reported_missing_registers_p;
>  /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
>     and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
>     mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
> -static void
> +void
>  add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
>  			const char *mangled_name, const char *acle_name)
>  {
>    tree mangled_name_tree
>      = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
> +  tree acle_name_tree
> +    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
>  
> -  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
> +  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
>    value = tree_cons (NULL_TREE, mangled_name_tree, value);
>    value = tree_cons (NULL_TREE, size_int (num_pr), value);
>    value = tree_cons (NULL_TREE, size_int (num_zr), value);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 31617510160..cba8879ab33 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -28527,7 +28527,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>  					int num, bool explicit_p)
>  {
>    tree t, ret_type;
> -  unsigned int nds_elt_bits;
> +  unsigned int nds_elt_bits, wds_elt_bits;
>    unsigned HOST_WIDE_INT const_simdlen;
>  
>    if (!TARGET_SIMD)
> @@ -28572,10 +28572,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>    if (TREE_CODE (ret_type) != VOID_TYPE)
>      {
>        nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
> +      wds_elt_bits = nds_elt_bits;
>        vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
>      }
>    else
> -    nds_elt_bits = POINTER_SIZE;
> +    {
> +      nds_elt_bits = POINTER_SIZE;
> +      wds_elt_bits = 0;
> +    }
>  
>    int i;
>    tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
> @@ -28583,44 +28587,72 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>    for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
>         t && t != void_list_node; t = TREE_CHAIN (t), i++)
>      {
> -      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
> +      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
>        if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
> -	  && !supported_simd_type (arg_type))
> +	  && !supported_simd_type (type))
>  	{
>  	  if (!explicit_p)
>  	    ;
> -	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
> +	  else if (COMPLEX_FLOAT_TYPE_P (type))
>  	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>  			"GCC does not currently support argument type %qT "
> -			"for simd", arg_type);
> +			"for simd", type);
>  	  else
>  	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>  			"unsupported argument type %qT for simd",
> -			arg_type);
> +			type);
>  	  return 0;
>  	}
> -      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
> +      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
>        if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
> -	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
> +	vec_elts.safe_push (std::make_pair (type, lane_bits));
>        if (nds_elt_bits > lane_bits)
>  	nds_elt_bits = lane_bits;
> +      if (wds_elt_bits < lane_bits)
> +	wds_elt_bits = lane_bits;
>      }
>  
> -  clonei->vecsize_mangle = 'n';
> +  /* If we could not determine the WDS type from available parameters/return,
> +     then fallback to using uintptr_t.  */
> +  if (wds_elt_bits == 0)
> +    wds_elt_bits = POINTER_SIZE;
> +
>    clonei->mask_mode = VOIDmode;
>    poly_uint64 simdlen;
> -  auto_vec<poly_uint64> simdlens (2);
> +  auto_vec<poly_uint64> simdlens (3);
> +  auto_vec<char> simdmangle (3);

Minor, but I think it'd be neater to use an ad-hoc structure that
contains the mangling prefix and simdlen together, so that only one
vector is needed.  Brace initialization should make it a bit shorter too.

>    /* Keep track of the possible simdlens the clones of this function can have,
>       and check them later to see if we support them.  */
>    if (known_eq (clonei->simdlen, 0U))
>      {
>        simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
>        if (maybe_ne (simdlen, 1U))
> -	simdlens.safe_push (simdlen);
> +	{
> +	  simdlens.safe_push (simdlen);
> +	  simdmangle.safe_push ('n');
> +	}
>        simdlens.safe_push (simdlen * 2);
> +      simdmangle.safe_push ('n');
> +      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
> +	 function.
> +	We have also disabled support for creating SVE simdclones for functions
> +	with function bodies and any simdclones when -msve-vector-bits is used.
> +	TODO: add support for these.  */
> +      if ((DECL_ARGUMENTS (node->decl) != 0
> +	   || type_arg_types != 0)

I think my comment from the previous review still stands:

  This check feels a bit indirect.  Does it work to use:

    if (prototype_p (TREE_TYPE (node->decl)))

  instead?

Or does that not work?

> +	  && !node->definition
> +	  && !aarch64_sve_vg.is_constant ())
> +	{
> +	  poly_uint64 sve_simdlen = aarch64_sve_vg * 64;
> +	  simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits));

Simpler as:

	  simdlens.safe_push (exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits));

> +	  simdmangle.safe_push ('s');
> +	}
>      }
>    else
> -    simdlens.safe_push (clonei->simdlen);
> +    {
> +      simdlens.safe_push (clonei->simdlen);
> +      simdmangle.safe_push ('n');
> +    }
>  
>    clonei->vecsize_int = 0;
>    clonei->vecsize_float = 0;
> @@ -28638,7 +28670,8 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>      {
>        bool remove_simdlen = false;
>        for (auto elt : vec_elts)
> -	if (known_gt (simdlens[j] * elt.second, 128U))
> +	if (simdmangle[j] == 'n'
> +	    && known_gt (simdlens[j] * elt.second, 128U))
>  	  {
>  	    /* Don't issue a warning for every simdclone when there is no
>  	       specific simdlen clause.  */
> @@ -28651,12 +28684,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>  	    break;
>  	  }
>        if (remove_simdlen)
> -	simdlens.ordered_remove (j);
> +	{
> +	  simdlens.ordered_remove (j);
> +	  simdmangle.ordered_remove (j);
> +	}
>        else
>  	j++;
>      }
>  
> -
>    int count = simdlens.length ();
>    if (count == 0)
>      {
> @@ -28675,20 +28710,107 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>  
>    gcc_assert (num < count);
>    clonei->simdlen = simdlens[num];
> +  clonei->vecsize_mangle = simdmangle[num];
> +  /* SVE simdclones always have a Mask, so set inbranch to 1.  */
> +  if (clonei->vecsize_mangle == 's')
> +    clonei->inbranch = 1;
>    return count;
>  }
>  
> +static tree
> +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
> +{
> +    unsigned int num_zr = 0;

From the previous review:

  Nits: missing function comment.  The body is indented by too many columns.

> +    unsigned int num_pr = 0;
> +    machine_mode vector_mode;
> +    type = TREE_TYPE (type);
> +    scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type));

SCALAR_TYPE_MODE

> +    gcc_assert (aarch64_sve_data_mode (scalar_m,
> +				       simdlen).exists (&vector_mode));

Better to use require () instead, since gcc_asserts can be compiled out.

> +    type = build_vector_type_for_mode (type, vector_mode);
> +    if (is_mask)
> +      {
> +	type = truth_type_for (type);
> +	num_pr = 1;
> +      }
> +    else
> +      num_zr = 1;
> +
> +    aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL,
> +					 NULL);

The comment from my previous review still stands:

  Before adding the atttribute, I think we should call:

    type = build_distinct_type_copy (type);

  so that we don't change a type that is already in use, or associate
  any new types with this one.

I think it'd also be worth adding a comment to say why we take this
approach instead of reusing ACLE types.  (The reason being that we need
to handle unpacked vectors as well, which the ACLE doesn't provide.)

> +    return type;
> +}
> +
>  /* Implement TARGET_SIMD_CLONE_ADJUST.  */
>  
>  static void
>  aarch64_simd_clone_adjust (struct cgraph_node *node)
>  {
> -  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
> -     use the correct ABI.  */
> -
>    tree t = TREE_TYPE (node->decl);
> -  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
> -					TYPE_ATTRIBUTES (t));
> +  cl_target_option cur_target;
> +  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
> +
> +  if (node->simdclone->vecsize_mangle == 's')
> +    {
> +      tree target = build_string (strlen ("+sve"), "+sve");

Probably worth adding a comment here to say (as you noted in the reply
to the last review) that this is additive and has no effect if SVE (or
higher) is already enabled.

> +      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);

I still think it'd be better to assert that this succeeds (via
a gcc_unreachable).  It looks weird to call a _p function and not test
the result.

> +      cl_target_option_save (&cur_target, &global_options, &global_options_set);
> +      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
> +      cl_target_option_restore (&global_options, &global_options_set,
> +				TREE_TARGET_OPTION (new_target));
> +      aarch64_override_options_internal (&global_options);
> +      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
> +	      sizeof (have_regs_of_mode));
> +      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
> +	if (aarch64_sve_mode_p ((machine_mode) i))
> +	  have_regs_of_mode[i] = true;

Sorry, just realised I never replied to your question about the
push_cfun/pop_cfun suggestion.  I think the function we'd push is
node->decl, i.e. the one that received the +sve target attribute.

I.e. could we do:

    push_cfun (node->decl);

after aarch64_option_valid_attribute_p and skip the rest?  Then do
pop_cfun as the restoration step.

Does the above work with the:

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.cc.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)

shortcut in aarch64_override_options_internal?  I have no particular
reason to believe that it wouldn't, just wanted to check...

> +    }
> +  else
> +    {
> +	/* Add aarch64_vector_pcs target attribute to SIMD clones so they
> +	   use the correct ABI.  */
> +	TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
> +					      TYPE_ATTRIBUTES (t));
> +    }
> +  cgraph_simd_clone *sc = node->simdclone;
> +
> +  for (unsigned i = 0; i < sc->nargs; ++i)
> +    {
> +      bool is_mask = false;
> +      tree type;
> +      switch (sc->args[i].arg_type)
> +	{
> +	case SIMD_CLONE_ARG_TYPE_MASK:
> +	  is_mask = true;
> +	  gcc_fallthrough ();
> +	case SIMD_CLONE_ARG_TYPE_VECTOR:
> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
> +	  type = sc->args[i].vector_type;
> +	  gcc_assert (VECTOR_TYPE_P (type));
> +	  if (node->simdclone->vecsize_mangle == 's')
> +	    type = simd_clone_adjust_sve_vector_type (type, is_mask,
> +						      sc->simdlen);
> +	  else if (is_mask)
> +	    type = truth_type_for (type);
> +	  sc->args[i].vector_type = type;

Probably best to add a break here (or a fall-through if you prefer).

> +	default:
> +	    continue;

Nit: over-indented continue.  But it might as well be a break.

> +	}
> +    }
> +  if (node->simdclone->vecsize_mangle == 's')
> +    {
> +      tree ret_type = TREE_TYPE (t);
> +      if (VECTOR_TYPE_P (ret_type))
> +	TREE_TYPE (t)
> +	  = simd_clone_adjust_sve_vector_type (ret_type, false,
> +					       node->simdclone->simdlen);
> +      /* Restore current options.  */
> +      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
> +      aarch64_override_options_internal (&global_options);
> +      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
> +	      sizeof (have_regs_of_mode));
> +    }
>  }
>  
>  /* Implement TARGET_SIMD_CLONE_USABLE.  */
> @@ -28705,6 +28827,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo)
>  	  && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))))
>  	return -1;
>        return 0;
> +    case 's':
> +      if (!TARGET_SVE)
> +	return -1;
> +      return 0;
>      default:
>        gcc_unreachable ();
>      }
> diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
> index 864586207ee..066b6217253 100644
> --- a/gcc/omp-simd-clone.cc
> +++ b/gcc/omp-simd-clone.cc
> @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node,
>    pp_string (&pp, "_ZGV");
>    pp_character (&pp, vecsize_mangle);
>    pp_character (&pp, mask);
> -  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
> -  unsigned int len = simdlen.to_constant ();
> -  pp_decimal_int (&pp, (len));
> +
> +  unsigned long long len = 0;

unsigned HOST_WIDE_INT

> +  if (simdlen.is_constant (&len))
> +    pp_decimal_int (&pp, (int) (len));
> +  else
> +    pp_character (&pp, 'x');
>  
>    for (n = 0; n < clone_info->nargs; ++n)
>      {
> @@ -1533,8 +1536,8 @@ simd_clone_adjust (struct cgraph_node *node)
>  	 below).  */
>        loop = alloc_loop ();
>        cfun->has_force_vectorize_loops = true;
> -      /* For now, simlen is always constant.  */
> -      loop->safelen = node->simdclone->simdlen.to_constant ();
> +      /* We can assert that safelen is the 'minimum' simdlen.  */
> +      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
>        loop->force_vectorize = true;
>        loop->header = body_bb;
>      }
> diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> index e3668893afe..2b71869787e 100644
> --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> @@ -1,6 +1,6 @@
> -/* { dg-do compile { target vect_simd_clones } } */
> +/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
>  /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */
> -/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */
> +/* { dg-additional-options "-mno-sse3" } */

Please get Jakub's OK for this part.  Similarly for the Fortran test.

>  
>  int f01 (int);
>  int f02 (int);
> @@ -15,15 +15,13 @@ int
>  test1 (int x)
>  {
>    /* At gimplification time, we can't decide yet which function to call.  */
> -  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */
> +  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
>    /* After simd clones are created, the original non-clone test1 shall
>       call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
>       shall call f01 with score 8.  */
>    /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
> -  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
> -  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
> -  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
> -  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
> +  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */
> +  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */
>    int a = f04 (x);
>    int b = f04 (x);
>    return a + b;

This part I feel safer with :)

> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> index e2e80f0c663..2f4d3a866e5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> @@ -43,6 +43,7 @@ float f04 (double a)
>  }
>  /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
>  /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
> +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
>  
>  #pragma omp declare simd uniform(a) linear (b)
>  void f05 (short a, short *b, short c)
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
> new file mode 100644
> index 00000000000..71fd361acec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile }  */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */
> +extern int __attribute__ ((simd, const)) fn0 (int);
> +
> +void test_fn0 (int *a, int *b, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] += fn0 (b[i]);
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */
> +
> +extern int __attribute__ ((simd, const)) fn1 (short, int);
> +
> +void test_fn1 (int *a, int *b, short *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = fn1 (c[i], b[i]);
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */
> +
> +extern short __attribute__ ((simd, const)) fn2 (short, int);
> +
> +void test_fn2 (short *a, int *b, short *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = fn2 (c[i], b[i]);
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */
> +
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */
> +
> +extern short __attribute__ ((simd, const)) fn4 (int, short);
> +
> +void test_fn4 (int *a, int *b, short *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */

It'd be nice to have some more specific testing here.  Although there
are 5 different signatures, the last 4 are interchangeable as far as
the test goes.  E.g. maybe it would be possible to have some partial
check-function-bodies tests that match the inner loop.  Do we
use extending loads for the unpacked vectors?  (Hope so.)

Thanks,
Richard

> diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> index 6319df0558f..3c7d093c5c6 100644
> --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> @@ -1,6 +1,6 @@
> -! { dg-do compile { target vect_simd_clones } }
> +! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
>  ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" }
> -! { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } }
> +! { dg-additional-options "-mno-sse3" }
>  
>  module main
>    implicit none
> @@ -41,7 +41,7 @@ contains
>      ! shall call f01 with score 8.
>      ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
>      ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
> -    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
> +    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
>      ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
>      ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
>      a = f04 (x)
  

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index a0b142e0b94..207396de0ff 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1031,6 +1031,8 @@  namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+			      const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 11f5c5c500c..747131e684e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -953,14 +953,16 @@  static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
    and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
    mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
 			const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
     = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 31617510160..cba8879ab33 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -28527,7 +28527,7 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 					int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   unsigned HOST_WIDE_INT const_simdlen;
 
   if (!TARGET_SIMD)
@@ -28572,10 +28572,14 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
     {
       nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+      wds_elt_bits = nds_elt_bits;
       vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
     }
   else
-    nds_elt_bits = POINTER_SIZE;
+    {
+      nds_elt_bits = POINTER_SIZE;
+      wds_elt_bits = 0;
+    }
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -28583,44 +28587,72 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
        t && t != void_list_node; t = TREE_CHAIN (t), i++)
     {
-      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
+      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
-	  && !supported_simd_type (arg_type))
+	  && !supported_simd_type (type))
 	{
 	  if (!explicit_p)
 	    ;
-	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
+	  else if (COMPLEX_FLOAT_TYPE_P (type))
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"GCC does not currently support argument type %qT "
-			"for simd", arg_type);
+			"for simd", type);
 	  else
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"unsupported argument type %qT for simd",
-			arg_type);
+			type);
 	  return 0;
 	}
-      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
+      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
       if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
-	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
+	vec_elts.safe_push (std::make_pair (type, lane_bits));
       if (nds_elt_bits > lane_bits)
 	nds_elt_bits = lane_bits;
+      if (wds_elt_bits < lane_bits)
+	wds_elt_bits = lane_bits;
     }
 
-  clonei->vecsize_mangle = 'n';
+  /* If we could not determine the WDS type from available parameters/return,
+     then fallback to using uintptr_t.  */
+  if (wds_elt_bits == 0)
+    wds_elt_bits = POINTER_SIZE;
+
   clonei->mask_mode = VOIDmode;
   poly_uint64 simdlen;
-  auto_vec<poly_uint64> simdlens (2);
+  auto_vec<poly_uint64> simdlens (3);
+  auto_vec<char> simdmangle (3);
   /* Keep track of the possible simdlens the clones of this function can have,
      and check them later to see if we support them.  */
   if (known_eq (clonei->simdlen, 0U))
     {
       simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
       if (maybe_ne (simdlen, 1U))
-	simdlens.safe_push (simdlen);
+	{
+	  simdlens.safe_push (simdlen);
+	  simdmangle.safe_push ('n');
+	}
       simdlens.safe_push (simdlen * 2);
+      simdmangle.safe_push ('n');
+      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
+	 function.
+	We have also disabled support for creating SVE simdclones for functions
+	with function bodies and any simdclones when -msve-vector-bits is used.
+	TODO: add support for these.  */
+      if ((DECL_ARGUMENTS (node->decl) != 0
+	   || type_arg_types != 0)
+	  && !node->definition
+	  && !aarch64_sve_vg.is_constant ())
+	{
+	  poly_uint64 sve_simdlen = aarch64_sve_vg * 64;
+	  simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits));
+	  simdmangle.safe_push ('s');
+	}
     }
   else
-    simdlens.safe_push (clonei->simdlen);
+    {
+      simdlens.safe_push (clonei->simdlen);
+      simdmangle.safe_push ('n');
+    }
 
   clonei->vecsize_int = 0;
   clonei->vecsize_float = 0;
@@ -28638,7 +28670,8 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
     {
       bool remove_simdlen = false;
       for (auto elt : vec_elts)
-	if (known_gt (simdlens[j] * elt.second, 128U))
+	if (simdmangle[j] == 'n'
+	    && known_gt (simdlens[j] * elt.second, 128U))
 	  {
 	    /* Don't issue a warning for every simdclone when there is no
 	       specific simdlen clause.  */
@@ -28651,12 +28684,14 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 	    break;
 	  }
       if (remove_simdlen)
-	simdlens.ordered_remove (j);
+	{
+	  simdlens.ordered_remove (j);
+	  simdmangle.ordered_remove (j);
+	}
       else
 	j++;
     }
 
-
   int count = simdlens.length ();
   if (count == 0)
     {
@@ -28675,20 +28710,107 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 
   gcc_assert (num < count);
   clonei->simdlen = simdlens[num];
+  clonei->vecsize_mangle = simdmangle[num];
+  /* SVE simdclones always have a Mask, so set inbranch to 1.  */
+  if (clonei->vecsize_mangle == 's')
+    clonei->inbranch = 1;
   return count;
 }
 
+static tree
+simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
+{
+    unsigned int num_zr = 0;
+    unsigned int num_pr = 0;
+    machine_mode vector_mode;
+    type = TREE_TYPE (type);
+    scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type));
+    gcc_assert (aarch64_sve_data_mode (scalar_m,
+				       simdlen).exists (&vector_mode));
+    type = build_vector_type_for_mode (type, vector_mode);
+    if (is_mask)
+      {
+	type = truth_type_for (type);
+	num_pr = 1;
+      }
+    else
+      num_zr = 1;
+
+    aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL,
+					 NULL);
+    return type;
+}
+
 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
 
 static void
 aarch64_simd_clone_adjust (struct cgraph_node *node)
 {
-  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
-     use the correct ABI.  */
-
   tree t = TREE_TYPE (node->decl);
-  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
-					TYPE_ATTRIBUTES (t));
+  cl_target_option cur_target;
+  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree target = build_string (strlen ("+sve"), "+sve");
+      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
+      cl_target_option_save (&cur_target, &global_options, &global_options_set);
+      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+      cl_target_option_restore (&global_options, &global_options_set,
+				TREE_TARGET_OPTION (new_target));
+      aarch64_override_options_internal (&global_options);
+      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+	if (aarch64_sve_mode_p ((machine_mode) i))
+	  have_regs_of_mode[i] = true;
+    }
+  else
+    {
+	/* Add aarch64_vector_pcs target attribute to SIMD clones so they
+	   use the correct ABI.  */
+	TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
+					      TYPE_ATTRIBUTES (t));
+    }
+  cgraph_simd_clone *sc = node->simdclone;
+
+  for (unsigned i = 0; i < sc->nargs; ++i)
+    {
+      bool is_mask = false;
+      tree type;
+      switch (sc->args[i].arg_type)
+	{
+	case SIMD_CLONE_ARG_TYPE_MASK:
+	  is_mask = true;
+	  gcc_fallthrough ();
+	case SIMD_CLONE_ARG_TYPE_VECTOR:
+	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
+	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
+	  type = sc->args[i].vector_type;
+	  gcc_assert (VECTOR_TYPE_P (type));
+	  if (node->simdclone->vecsize_mangle == 's')
+	    type = simd_clone_adjust_sve_vector_type (type, is_mask,
+						      sc->simdlen);
+	  else if (is_mask)
+	    type = truth_type_for (type);
+	  sc->args[i].vector_type = type;
+	default:
+	    continue;
+	}
+    }
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree ret_type = TREE_TYPE (t);
+      if (VECTOR_TYPE_P (ret_type))
+	TREE_TYPE (t)
+	  = simd_clone_adjust_sve_vector_type (ret_type, false,
+					       node->simdclone->simdlen);
+      /* Restore current options.  */
+      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
+      aarch64_override_options_internal (&global_options);
+      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+    }
 }
 
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
@@ -28705,6 +28827,10 @@  aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo)
 	  && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))))
 	return -1;
       return 0;
+    case 's':
+      if (!TARGET_SVE)
+	return -1;
+      return 0;
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 864586207ee..066b6217253 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -541,9 +541,12 @@  simd_clone_mangle (struct cgraph_node *node,
   pp_string (&pp, "_ZGV");
   pp_character (&pp, vecsize_mangle);
   pp_character (&pp, mask);
-  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
-  unsigned int len = simdlen.to_constant ();
-  pp_decimal_int (&pp, (len));
+
+  unsigned long long len = 0;
+  if (simdlen.is_constant (&len))
+    pp_decimal_int (&pp, (int) (len));
+  else
+    pp_character (&pp, 'x');
 
   for (n = 0; n < clone_info->nargs; ++n)
     {
@@ -1533,8 +1536,8 @@  simd_clone_adjust (struct cgraph_node *node)
 	 below).  */
       loop = alloc_loop ();
       cfun->has_force_vectorize_loops = true;
-      /* For now, simlen is always constant.  */
-      loop->safelen = node->simdclone->simdlen.to_constant ();
+      /* We can assert that safelen is the 'minimum' simdlen.  */
+      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
       loop->force_vectorize = true;
       loop->header = body_bb;
     }
diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
index e3668893afe..2b71869787e 100644
--- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
+++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
@@ -1,6 +1,6 @@ 
-/* { dg-do compile { target vect_simd_clones } } */
+/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
 /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */
-/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-mno-sse3" } */
 
 int f01 (int);
 int f02 (int);
@@ -15,15 +15,13 @@  int
 test1 (int x)
 {
   /* At gimplification time, we can't decide yet which function to call.  */
-  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
   /* After simd clones are created, the original non-clone test1 shall
      call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
      shall call f01 with score 8.  */
   /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
-  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
-  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
-  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
-  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */
+  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */
   int a = f04 (x);
   int b = f04 (x);
   return a + b;
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
index e2e80f0c663..2f4d3a866e5 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
@@ -43,6 +43,7 @@  float f04 (double a)
 }
 /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
 /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
+/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
 
 #pragma omp declare simd uniform(a) linear (b)
 void f05 (short a, short *b, short c)
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
new file mode 100644
index 00000000000..71fd361acec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
@@ -0,0 +1,52 @@ 
+/* { dg-do compile }  */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */
+extern int __attribute__ ((simd, const)) fn0 (int);
+
+void test_fn0 (int *a, int *b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] += fn0 (b[i]);
+}
+
+/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */
+
+extern int __attribute__ ((simd, const)) fn1 (short, int);
+
+void test_fn1 (int *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = fn1 (c[i], b[i]);
+}
+
+/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */
+
+extern short __attribute__ ((simd, const)) fn2 (short, int);
+
+void test_fn2 (short *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = fn2 (c[i], b[i]);
+}
+
+/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */
+
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */
+
+extern short __attribute__ ((simd, const)) fn4 (int, short);
+
+void test_fn4 (int *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
+}
+
+/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */
diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
index 6319df0558f..3c7d093c5c6 100644
--- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
@@ -1,6 +1,6 @@ 
-! { dg-do compile { target vect_simd_clones } }
+! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
 ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" }
-! { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } }
+! { dg-additional-options "-mno-sse3" }
 
 module main
   implicit none
@@ -41,7 +41,7 @@  contains
     ! shall call f01 with score 8.
     ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
     ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
-    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
+    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
     a = f04 (x)