[RFC,4/X] omp, aarch64: Add SVE support for 'omp declare simd' [PR 96342]

Message ID c4e8c0df-3c3b-852c-3e87-e54ead721fc8@arm.com
State New
Headers
Series [RFC,4/X] omp, aarch64: Add SVE support for 'omp declare simd' [PR 96342] |

Commit Message

Andre Vieira (lists) March 8, 2023, 4:25 p.m. UTC
  Hi,

This patch adds SVE support for simd clone generation when using 'omp 
declare simd'. The design is based on what was discussed in PR 96342, 
but I did not look at YangYang's patch as I wasn't sure of whether that 
code's copyright had been assigned to FSF.

This patch also is not in accordance with the examples in the BETA 
VFABIA64 document that can be found in the vfabia64 subdir of 
https://github.com/ARM-software/abi-aa/
If we agree to this approach I will propose changes to the ABI.
It differs in that we take the ommission of 'simdlen' to be the only way 
to create a SVE simd clone using 'omp declare simd', and that the 
current target defined on the command-line has no bearing in what simd 
clones are generated. This SVE simd clone is always VLA.
The document describes a way to specify SVE simdclones of VLS by using 
the simdlen clause, but that would require another way to toggle between 
SVE and Advanced SIMD and since there is no clause to do that for 'omp 
declare simd' I would have to assume this would be controllable by the 
command-line target options (march/mcpu).
By generating all possible Advanced SIMD simdlens and a VLA simdlen for 
SVE when ommitting simdlen we would be adhering to the same practice 
x86_64 does.

Targethook changes

This patch proposes two targethook changes:
1) Add mode parameter to TARGET_SIMD_CLONE_USABLE
We require the mode parameter to distinguish between calls to a simd 
clone from a Advanced SIMD mode and a SVE mode.

2) Add new TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
We require this to be able to modify the types used in SVE simd clones, 
as we need to add the SVE type attribute so that the correct PCS can be 
applied.

Other notable changes:
- We discourage the use of an 'inbranch' simdclone for when the caller 
is not in a branch, such that it picks a 'notinbranch' variant if 
available over an inbranch one. (we could probably rely on ordering but 
that's quite error prone and the ordering I'm looking at is by 
definition target specific).
- I currently put the VLA mangling in the target agnostic mangling 
function, if other targets with VLA want to use a different mangling in 
the future we may want to change this into a targethook.


I'll create a ChangeLog when I turn this into a PATCH if we agree on 
this direction.
  

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f75eb892f3daa7c2576efcedc8d944ab1e895cdb..122a473770eb4526ecce326f02d843608d088b5b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -995,6 +995,8 @@  namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+			      const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@  static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
    and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
    mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
 			const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
     = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5c40b6ed22a508723bd535a7460762c3a243d441..ef93a4e9d43799df4410f152cdd798db285e8897 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4015,13 +4015,13 @@  aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-    return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
       || aarch64_takes_arguments_in_sve_regs_p (fntype))
     return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+    return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -26968,14 +26968,21 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 	}
     }
 
-  clonei->vecsize_mangle = 'n';
   clonei->mask_mode = VOIDmode;
   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
   if (known_eq (clonei->simdlen, 0U))
     {
-      count = 2;
-      vec_bits = (num == 0 ? 64 : 128);
-      clonei->simdlen = exact_div (vec_bits, elt_bits);
+      if (num >= 2)
+	{
+	  vec_bits = poly_uint64 (128, 128);
+	  clonei->simdlen = exact_div (vec_bits, elt_bits);
+	}
+      else
+	{
+	  count = 3;
+	  vec_bits = (num == 0 ? 64 : 128);
+	  clonei->simdlen = exact_div (vec_bits, elt_bits);
+	}
     }
   else
     {
@@ -26994,6 +27001,15 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 	  return 0;
 	}
     }
+
+  if (num >= 2)
+    {
+      clonei->vecsize_mangle = 's';
+      clonei->inbranch = 1;
+    }
+  else
+    clonei->vecsize_mangle = 'n';
+
   clonei->vecsize_int = vec_bits;
   clonei->vecsize_float = vec_bits;
   return count;
@@ -27010,17 +27026,28 @@  aarch64_simd_clone_adjust (struct cgraph_node *node)
   tree t = TREE_TYPE (node->decl);
   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
 					TYPE_ATTRIBUTES (t));
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree target = build_string (strlen ("+sve"), "+sve");
+      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
+    }
 }
 
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
 
 static int
-aarch64_simd_clone_usable (struct cgraph_node *node)
+aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
 {
   switch (node->simdclone->vecsize_mangle)
     {
     case 'n':
-      if (!TARGET_SIMD)
+      if (!TARGET_SIMD
+	  || aarch64_sve_mode_p (vector_mode))
+	return -1;
+      return 0;
+    case 's':
+      if (!TARGET_SVE
+	  || !aarch64_sve_mode_p (vector_mode))
 	return -1;
       return 0;
     default:
@@ -27028,6 +27055,61 @@  aarch64_simd_clone_usable (struct cgraph_node *node)
     }
 }
 
+/* Implement TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM.  */
+
+static tree
+aarch64_simd_clone_adjust_ret_or_param (struct cgraph_node *node, tree type,
+					bool is_mask)
+{
+  if (type
+      && VECTOR_TYPE_P (type)
+      && node->simdclone->vecsize_mangle == 's')
+    {
+      cl_target_option cur_target;
+      cl_target_option_save (&cur_target, &global_options, &global_options_set);
+      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+      cl_target_option_restore (&global_options, &global_options_set,
+				TREE_TARGET_OPTION (new_target));
+      aarch64_override_options_internal (&global_options);
+      bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+	if (aarch64_sve_mode_p ((machine_mode) i))
+	  have_regs_of_mode[i] = true;
+      poly_uint16 old_sve_vg = aarch64_sve_vg;
+      if (!node->simdclone->simdlen.is_constant ())
+	aarch64_sve_vg = poly_uint16 (2, 2);
+      unsigned int num_zr = 0;
+      unsigned int num_pr = 0;
+      if (is_mask)
+	{
+	  type = truth_type_for (type);
+	  num_pr = 1;
+	}
+      else
+	{
+	  num_zr = 1;
+	  tree base_type = TREE_TYPE (type);
+	  if (POINTER_TYPE_P (base_type))
+	    base_type = pointer_sized_int_node;
+	  poly_int64 vec_size = tree_to_poly_int64 (TYPE_SIZE (type));
+	  scalar_mode base_mode = as_a <scalar_mode> (TYPE_MODE (base_type));
+	  machine_mode vec_mode
+	    = aarch64_simd_container_mode (base_mode, vec_size);
+	  type = build_vector_type_for_mode (base_type, vec_mode);
+	}
+
+      aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
+      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
+      aarch64_override_options_internal (&global_options);
+      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      aarch64_sve_vg = old_sve_vg;
+    }
+  return type;
+}
+
 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
 
 static int
@@ -28048,6 +28130,10 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SIMD_CLONE_USABLE
 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
 
+#undef TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
+#define TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM \
+  aarch64_simd_clone_adjust_ret_or_param
+
 #undef TARGET_COMP_TYPE_ATTRIBUTES
 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c6c891972d1e58cd163b259ba96a599d62326865..ed12271027305a0017cb9b2ff821bad403c52836 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6306,11 +6306,16 @@  This hook should add implicit @code{attribute(target("..."))} attribute
 to SIMD clone @var{node} if needed.
 @end deftypefn
 
-@deftypefn {Target Hook} int TARGET_SIMD_CLONE_USABLE (struct cgraph_node *@var{})
+@deftypefn {Target Hook} int TARGET_SIMD_CLONE_USABLE (struct cgraph_node *@var{}, @var{machine_mode})
 This hook should return -1 if SIMD clone @var{node} shouldn't be used
-in vectorized loops in current function, or non-negative number if it is
-usable.  In that case, the smaller the number is, the more desirable it is
-to use it.
+in vectorized loops being vectorized with mode @var{m} in current function, or
+non-negative number if it is usable.  In that case, the smaller the number is,
+the more desirable it is to use it.
+@end deftypefn
+
+@deftypefn {Target Hook} tree TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM (struct cgraph_node *@var{}, @var{tree}, @var{bool})
+If defined, this hook should adjust the type of the return or parameter
+@var{type} to be used by the simd clone @var{node}.
 @end deftypefn
 
 @deftypefn {Target Hook} int TARGET_SIMT_VF (void)
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 613b2534149415f442163d599503efaf423b673b..fd0d2c8d0dcc2fd249b34745d77749d99c49d13d 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4205,6 +4205,8 @@  address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_SIMD_CLONE_USABLE
 
+@hook TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
+
 @hook TARGET_SIMT_VF
 
 @hook TARGET_OMP_DEVICE_KIND_ARCH_ISA
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 48b480e7556d9ad8e5502e10e513ec36b17b9cbb..4808608b7a1c06802ee231480c2003cf41c11799 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -378,8 +378,9 @@  simd_clone_clauses_extract (struct cgraph_node *node, tree clauses,
 		  arg_type = SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP;
 		clone_info->args[argno].arg_type = arg_type;
 		clone_info->args[argno].linear_step = tree_to_shwi (step);
+		int nargs = clone_info->nargs;
 		gcc_assert (clone_info->args[argno].linear_step >= 0
-			    && clone_info->args[argno].linear_step < n);
+			    && clone_info->args[argno].linear_step < nargs);
 	      }
 	    else
 	      {
@@ -541,9 +542,12 @@  simd_clone_mangle (struct cgraph_node *node,
   pp_string (&pp, "_ZGV");
   pp_character (&pp, vecsize_mangle);
   pp_character (&pp, mask);
-  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
-  unsigned int len = simdlen.to_constant ();
-  pp_decimal_int (&pp, (len));
+
+  unsigned long long len = 0;
+  if (simdlen.is_constant (&len))
+    pp_decimal_int (&pp, (int) (len));
+  else
+    pp_character (&pp, 'x');
 
   for (n = 0; n < clone_info->nargs; ++n)
     {
@@ -736,6 +740,7 @@  simd_clone_adjust_return_type (struct cgraph_node *node)
       t = build_array_type_nelts (t, exact_div (node->simdclone->simdlen,
 						veclen));
     }
+  t = targetm.simd_clone.adjust_ret_or_param (node, t, false);
   TREE_TYPE (TREE_TYPE (fndecl)) = t;
   if (!node->definition)
     return NULL_TREE;
@@ -748,6 +753,7 @@  simd_clone_adjust_return_type (struct cgraph_node *node)
 
   tree atype = build_array_type_nelts (orig_rettype,
 				       node->simdclone->simdlen);
+  atype = targetm.simd_clone.adjust_ret_or_param (node, atype, false);
   if (maybe_ne (veclen, node->simdclone->simdlen))
     return build1 (VIEW_CONVERT_EXPR, atype, t);
 
@@ -807,8 +813,14 @@  simd_clone_adjust_argument_types (struct cgraph_node *node)
     {
       ipa_adjusted_param adj;
       memset (&adj, 0, sizeof (adj));
-      tree parm = args[i];
-      tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+      tree parm = NULL_TREE;
+      tree parm_type = NULL_TREE;
+      if(i < args.length())
+	{
+	  parm = args[i];
+	  parm_type = node->definition ? TREE_TYPE (parm) : parm;
+	}
+
       adj.base_index = i;
       adj.prev_clone_index = i;
 
@@ -874,6 +886,8 @@  simd_clone_adjust_argument_types (struct cgraph_node *node)
 				       ? IDENTIFIER_POINTER (DECL_NAME (parm))
 				       : NULL, parm_type, sc->simdlen);
 	}
+      adj.type = targetm.simd_clone.adjust_ret_or_param (node, adj.type,
+							 false);
       vec_safe_push (new_params, adj);
     }
 
@@ -906,6 +920,8 @@  simd_clone_adjust_argument_types (struct cgraph_node *node)
 	adj.type = build_vector_type (pointer_sized_int_node, veclen);
       else
 	adj.type = build_vector_type (base_type, veclen);
+      adj.type = targetm.simd_clone.adjust_ret_or_param (node, adj.type,
+							 true);
       vec_safe_push (new_params, adj);
 
       k = vector_unroll_factor (sc->simdlen, veclen);
@@ -931,6 +947,7 @@  simd_clone_adjust_argument_types (struct cgraph_node *node)
 	    sc->args[i].simd_array = NULL_TREE;
 	}
       sc->args[i].orig_type = base_type;
+      sc->args[i].vector_type = adj.type;
       sc->args[i].arg_type = SIMD_CLONE_ARG_TYPE_MASK;
     }
 
@@ -1485,8 +1502,8 @@  simd_clone_adjust (struct cgraph_node *node)
 	 below).  */
       loop = alloc_loop ();
       cfun->has_force_vectorize_loops = true;
-      /* For now, simlen is always constant.  */
-      loop->safelen = node->simdclone->simdlen.to_constant ();
+      /* We can assert that safelen is the 'minimum' simdlen.  */
+      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
       loop->force_vectorize = true;
       loop->header = body_bb;
     }
@@ -1546,7 +1563,7 @@  simd_clone_adjust (struct cgraph_node *node)
 	  mask = gimple_assign_lhs (g);
 	  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
 				   BIT_AND_EXPR, mask,
-				   build_int_cst (TREE_TYPE (mask), 1));
+				   build_one_cst (TREE_TYPE (mask)));
 	  gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING);
 	  mask = gimple_assign_lhs (g);
 	}
diff --git a/gcc/target.def b/gcc/target.def
index db8af0cbe81624513f114fc9bbd8be61d855f409..ffa12aa9023bb8f26a647a9848800c77f34afc67 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1645,10 +1645,18 @@  void, (struct cgraph_node *), NULL)
 DEFHOOK
 (usable,
 "This hook should return -1 if SIMD clone @var{node} shouldn't be used\n\
-in vectorized loops in current function, or non-negative number if it is\n\
-usable.  In that case, the smaller the number is, the more desirable it is\n\
-to use it.",
-int, (struct cgraph_node *), NULL)
+in vectorized loops being vectorized with mode @var{m} in current function, or\n\
+non-negative number if it is usable.  In that case, the smaller the number is,\n\
+the more desirable it is to use it.",
+int, (struct cgraph_node *, machine_mode), NULL)
+
+DEFHOOK
+(adjust_ret_or_param,
+"If defined, this hook should adjust the type of the return or parameter\n\
+@var{type} to be used by the simd clone @var{node}.",
+tree, (struct cgraph_node *, tree, bool),
+default_simd_clone_adjust_ret_or_param)
+
 
 HOOK_VECTOR_END (simd_clone)
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a1df260f5483dc84f18d8f12c5202484a32d5bb7..860fb8ccbf1ab00c43dc4b4d32808c1f488406e4 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -73,6 +73,9 @@  extern void default_print_operand (FILE *, rtx, int);
 extern void default_print_operand_address (FILE *, machine_mode, rtx);
 extern bool default_print_operand_punct_valid_p (unsigned char);
 extern tree default_mangle_assembler_name (const char *);
+extern tree default_simd_clone_adjust_ret_or_param
+  (struct cgraph_node *,tree , bool);
+
 
 extern machine_mode default_translate_mode_attribute (machine_mode);
 extern bool default_scalar_mode_supported_p (scalar_mode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index fe0116521feaf32187e7bc113bf93b1805852c79..4e54ceb0297828cf13e418dfa113651670a6f112 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -398,6 +398,16 @@  default_mangle_assembler_name (const char *name ATTRIBUTE_UNUSED)
   return get_identifier (stripped);
 }
 
+/* The default implementation of TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM.  */
+
+tree
+default_simd_clone_adjust_ret_or_param (struct cgraph_node *node ATTRIBUTE_UNUSED,
+					tree type,
+					bool is_return ATTRIBUTE_UNUSED)
+{
+  return type;
+}
+
 /* The default implementation of TARGET_TRANSLATE_MODE_ATTRIBUTE.  */
 
 machine_mode
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c85b6babc4bc5bc3111ef326dcc8f32bb25333f6..da6aa3f193bd52a1e40bb6dbe3d483f92ecd7896 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2759,7 +2759,8 @@  vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
     return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+	   || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
     {
       tree mask = build_int_cst (TREE_TYPE (masktype), -1);
       mask = build_vector_from_val (masktype, mask);
@@ -4136,14 +4137,6 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     }
 
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  if (!vf.is_constant ())
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not considering SIMD clones; not yet supported"
-			 " for variable-width vectors.\n");
-      return false;
-    }
 
   unsigned int badness = 0;
   struct cgraph_node *bestn = NULL;
@@ -4156,20 +4149,17 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	unsigned int this_badness = 0;
 	unsigned int num_calls;
 	if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls)
-	    || n->simdclone->nargs != nargs)
+	    || n->simdclone->nargs != (nargs + n->simdclone->inbranch))
 	  continue;
 	if (num_calls != 1)
 	  this_badness += exact_log2 (num_calls) * 4096;
 	if (n->simdclone->inbranch)
 	  this_badness += 8192;
-	int target_badness = targetm.simd_clone.usable (n);
+	int target_badness = targetm.simd_clone.usable (n, vinfo->vector_mode);
 	if (target_badness < 0)
 	  continue;
 	this_badness += target_badness * 512;
-	/* FORNOW: Have to add code to add the mask argument.  */
-	if (n->simdclone->inbranch)
-	  continue;
-	for (i = 0; i < nargs; i++)
+	for (i = 0; i < n->simdclone->nargs; i++)
 	  {
 	    switch (n->simdclone->args[i].arg_type)
 	      {
@@ -4206,16 +4196,22 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		i = -1;
 		break;
 	      case SIMD_CLONE_ARG_TYPE_MASK:
-		gcc_unreachable ();
+		/* Penalize using a predicated SIMD clone in a non-masked loop,
+		   as we'd have to needlessly construct an all-true mask.  */
+		if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+		  this_badness += 64;
+		break;
 	      }
 	    if (i == (size_t) -1)
 	      break;
-	    if (n->simdclone->args[i].alignment > arginfo[i].align)
+	    if (i < nargs
+		&& n->simdclone->args[i].alignment > arginfo[i].align)
 	      {
 		i = -1;
 		break;
 	      }
-	    if (arginfo[i].align)
+	    if (i < nargs
+		&& arginfo[i].align)
 	      this_badness += (exact_log2 (arginfo[i].align)
 			       - exact_log2 (n->simdclone->args[i].alignment));
 	  }
@@ -4248,6 +4244,7 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   fndecl = bestn->decl;
   nunits = bestn->simdclone->simdlen;
   ncopies = vector_unroll_factor (vf, nunits);
+  nargs = bestn->simdclone->nargs;
 
   /* If the function isn't const, only allow it in simd loops where user
      has asserted that at least nunits consecutive iterations can be
@@ -4331,11 +4328,45 @@  vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
       for (i = 0; i < nargs; i++)
 	{
-	  unsigned int k, l, m, o;
+	  unsigned long long k, l, m, o;
 	  tree atype;
-	  op = gimple_call_arg (stmt, i);
+	  if (i < gimple_call_num_args (stmt))
+	    op = gimple_call_arg (stmt, i);
+	  else
+	    op = NULL_TREE;
+
 	  switch (bestn->simdclone->args[i].arg_type)
 	    {
+	    case SIMD_CLONE_ARG_TYPE_MASK:
+		{
+		    tree mask;
+		    atype = bestn->simdclone->args[i].vector_type;
+		    if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+		      {
+			vec_loop_masks *loop_masks
+			  = &LOOP_VINFO_MASKS (loop_vinfo);
+			mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
+						   vectype, j);
+		      }
+		    else
+		      {
+			tree mask_type = bestn->simdclone->args[i].vector_type;
+			mask
+			  = vect_build_all_ones_mask (vinfo, stmt_info,
+						      mask_type);
+		      }
+		    if (!useless_type_conversion_p (TREE_TYPE (mask), atype))
+		      {
+			mask = build1 (VIEW_CONVERT_EXPR, atype, mask);
+			gassign *new_stmt
+			  = gimple_build_assign (make_ssa_name (atype), mask);
+			vect_finish_stmt_generation (vinfo, stmt_info,
+						     new_stmt, gsi);
+			mask = gimple_assign_lhs (new_stmt);
+		      }
+		    vargs.safe_push (mask);
+		}
+	      break;
 	    case SIMD_CLONE_ARG_TYPE_VECTOR:
 	      atype = bestn->simdclone->args[i].vector_type;
 	      o = vector_unroll_factor (nunits,