vect: Add bias parameter for partial vectorization

Message ID 440687a0-d6e5-50e0-7105-7914b910c8c6@linux.ibm.com
State New
Headers
Series vect: Add bias parameter for partial vectorization |

Commit Message

Robin Dapp Oct. 28, 2021, 2:44 p.m. UTC
  Hi,

as discussed in
https://gcc.gnu.org/pipermail/gcc-patches/2021-October/582627.html this
introduces a bias parameter for the len_load/len_store ifns as well as
optabs that is meant to distinguish between Power and s390 variants.
The default is a bias of 0, while in s390's case vll/vstl do not support
lengths of zero bytes and a bias of -1 should be used.

Bootstrapped and regtested on Power9 (--with-cpu=power9) and s390
(--with-arch=z15).

The tiny changes in the Power backend I will post separately.

Regards
 Robin
  

Comments

Kewen.Lin Oct. 29, 2021, 3:10 a.m. UTC | #1
Hi Robin,

on 2021/10/28 下午10:44, Robin Dapp wrote:
> Hi,
> 
> as discussed in
> https://gcc.gnu.org/pipermail/gcc-patches/2021-October/582627.html this
> introduces a bias parameter for the len_load/len_store ifns as well as
> optabs that is meant to distinguish between Power and s390 variants.
> The default is a bias of 0, while in s390's case vll/vstl do not support
> lengths of zero bytes and a bias of -1 should be used.
> 
> Bootstrapped and regtested on Power9 (--with-cpu=power9) and s390
> (--with-arch=z15).
> 
> The tiny changes in the Power backend I will post separately.
> 

Thanks for extending this!

I guess your separate Power (rs6000) patch will be committed together with this
one? Otherwise I'm worried that those existing rs6000 partial vector
cases could fail, since the existing rs6000 optabs lack the new operand, which
isn't optional.

You might need to update the documentation doc/md.texi for the new operand
in sections len_load_@var{m} and len_store_@var{m}, and might want to add
the costing consideration for this non-zero biasing in hunk
"
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
"
of function vect_estimate_min_profitable_iters.

I may be overthinking this, but it seems we could add an assertion in function
vect_verify_loop_lens to ensure that (internal_len_load_bias_supported ==
internal_len_store_bias_supported), to avoid mixed biasing cases from
some weird targets or optab typos.

BR,
Kewen
  
Richard Sandiford Oct. 29, 2021, 7:46 a.m. UTC | #2
Robin Dapp <rdapp@linux.ibm.com> writes:
> Hi,
>
> as discussed in
> https://gcc.gnu.org/pipermail/gcc-patches/2021-October/582627.html this
> introduces a bias parameter for the len_load/len_store ifns as well as
> optabs that is meant to distinguish between Power and s390 variants.
> The default is a bias of 0, while in s390's case vll/vstl do not support
> lengths of zero bytes and a bias of -1 should be used.
>
> Bootstrapped and regtested on Power9 (--with-cpu=power9) and s390
> (--with-arch=z15).
>
> The tiny changes in the Power backend I will post separately.

Some comments in addition to what Kewen said:

> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..993e32c1854 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2727,7 +2727,16 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
>    else
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +  if (optab == len_load_optab)
> +    {
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
> +  else
> +    expand_insn (icode, 3, ops);
> +

The previous “if“ is also for len_load_optab, so it seems better to
combine the two.

>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2770,7 +2779,16 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
>    else
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +
> +  if (optab == len_store_optab)
> +    {
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
> +  else
> +    expand_insn (icode, 3, ops);

Same idea here.

>  }
>  
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4190,30 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
>  	  && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>  
> +/* Return the supported bias for the len_load IFN.  For now we support a
> +   default bias of 0 and -1 in case 0 is not an allowable length for len_load.

Neither bias is really the default.  How about just “For now we only
support the biases 0 and -1.”?

> +   If none of these biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_bias_supported (internal_fn ifn, machine_mode mode)

Since this is now providing the bias rather than asking about a
given bias (thanks), I think we should drop “_supported”.

> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* We only support a bias of 0 (default) or -1.  Try both
> +	 of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +	return 0;
> +      else if (insn_operand_matches (icode, 3, GEN_INT (-1)))

Minor nit, but: redundant else.

> +	return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>  
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..af28cf0d566 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>  						poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_bias_supported (internal_fn ifn,
> +						     machine_mode);
>  
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>  				    bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..cd2c33fc4a7 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,15 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  opt_machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false);
> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (internal_len_load_bias_supported (IFN_LEN_LOAD, len_load_mode.require ())
> +      == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)

Might as well add the require () to the get_len_load_store_mode call.

As Kewen says, it would be good to store the bias computed here in the
loop_vec_info and assert that all users agree.

If we do that, then there's also the option of adding the bias
to the lengths in vect_set_loop_controls_directly.

> +      return false;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..c3df26c8009 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,30 @@ vectorizable_store (vec_info *vinfo,
>  						   gsi);
>  		      vec_oprnd = var;
>  		    }
> +
> +		  /* Check which bias value to use.  Default is 0.
> +		     A bias of -1 means that we cannot emit a LEN_LOAD with
> +		     a length of 0 and need to subtract 1 from the length.  */
> +		  char biasval = internal_len_load_bias_supported
> +		    (IFN_LEN_STORE, new_vmode);
> +		  tree bias = build_int_cst (intQI_type_node, biasval);
> +		  tree new_len = final_len;
> +		  if (biasval != 0
> +		      && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +		    {
> +		      new_len = make_ssa_name (TREE_TYPE (final_len));
> +		      gassign *m1
> +			= gimple_build_assign (new_len, MINUS_EXPR,
> +					       final_len,
> +					       build_one_cst (TREE_TYPE
> +							  (final_len)));
> +		      vect_finish_stmt_generation (vinfo, stmt_info, m1,
> +						   gsi);
> +		    }

If we don't do that and add the bias at the use site instead,
then it'd be good to split this out into a subroutine so that
the load and store code can share it.

Thanks,
Richard

>  		  gcall *call
> -		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -						  ptr, final_len, vec_oprnd);
> +		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +						  ptr, new_len, vec_oprnd,
> +						  bias);
>  		  gimple_call_set_nothrow (call, true);
>  		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>  		  new_stmt = call;
> @@ -9588,24 +9609,46 @@ vectorizable_load (vec_info *vinfo,
>  					       vec_num * j + i);
>  			tree ptr = build_int_cst (ref_type,
>  						  align * BITS_PER_UNIT);
> +
> +			machine_mode vmode = TYPE_MODE (vectype);
> +			opt_machine_mode new_ovmode
> +			  = get_len_load_store_mode (vmode, true);
> +			machine_mode new_vmode = new_ovmode.require ();
> +			tree qi_type = unsigned_intQI_type_node;
> +			tree new_vtype
> +			  = build_vector_type_for_mode (qi_type, new_vmode);
> +
> +			/* Check which bias value to use.  Default is 0. */
> +			char biasval = internal_len_load_bias_supported
> +			  (IFN_LEN_LOAD, new_vmode);
> +
> +			tree bias = build_int_cst (intQI_type_node, biasval);
> +			tree new_len = final_len;
> +			if (biasval != 0
> +			    && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +			  {
> +			    new_len = make_ssa_name (TREE_TYPE (final_len));
> +			    gassign *m1 = gimple_build_assign (new_len,
> +							       MINUS_EXPR,
> +							       final_len,
> +							       build_one_cst
> +							       (TREE_TYPE
> +								(final_len)));
> +			    vect_finish_stmt_generation (vinfo, stmt_info, m1,
> +							 gsi);
> +			  }
> +
>  			gcall *call
> -			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>  							dataref_ptr, ptr,
> -							final_len);
> +							new_len, bias);
>  			gimple_call_set_nothrow (call, true);
>  			new_stmt = call;
>  			data_ref = NULL_TREE;
>  
>  			/* Need conversion if it's wrapped with VnQI.  */
> -			machine_mode vmode = TYPE_MODE (vectype);
> -			opt_machine_mode new_ovmode
> -			  = get_len_load_store_mode (vmode, true);
> -			machine_mode new_vmode = new_ovmode.require ();
>  			if (vmode != new_vmode)
>  			  {
> -			    tree qi_type = unsigned_intQI_type_node;
> -			    tree new_vtype
> -			      = build_vector_type_for_mode (qi_type, new_vmode);
>  			    tree var = vect_get_new_ssa_name (new_vtype,
>  							      vect_simple_var);
>  			    gimple_set_lhs (call, var);
  
Robin Dapp Nov. 2, 2021, 8:16 p.m. UTC | #3
Hi,

thanks for the helpful comments. The attached v2 addresses the following
points from them:

 - Save the bias in loop_vinfo and set it once in vect_verify_loop_lens.
 - Add code to handle the bias in vect_set_loop_controls_directly.
 - Adjust costing.
 - Add comments for the new len_load parameter as well as change wording.
 - Include the rs6000 change directly.

I'm not entirely satisfied with the bias code for the loop controls,
mainly because of side effects I might not have considered.  The test
suites show no new regressions and bootstrap succeeded, though.

Regards
 Robin
  
Kewen.Lin Nov. 4, 2021, 3:59 a.m. UTC | #4
Hi Robin,

on 2021/11/3 上午4:16, Robin Dapp wrote:
> Hi,
> 
> thanks for the helpful comments. The attached v2 addresses the following
> points from them:
> 
>  - Save the bias in loop_vinfo and set it once in vect_verify_loop_lens.
>  - Add code to handle the bias in vect_set_loop_controls_directly.
>  - Adjust costing.
>  - Add comments for the new len_load parameter as well as change wording.
>  - Include the rs6000 change directly.
> 
> I'm not entirely satisfied with the bias code for the loop controls,
> mainly because of side effects I might not have considered.  The test
> suites show no new regressions and bootstrap succeeded, though.
> 
> Regards
>  Robin
> 
> 
> vll-v2.patch
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index bf033e31c1c..dc2756f83e9 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5637,7 +5637,8 @@
>  (define_expand "len_load_v16qi"
>    [(match_operand:V16QI 0 "vlogical_operand")
>     (match_operand:V16QI 1 "memory_operand")
> -   (match_operand:QI 2 "gpc_reg_operand")]
> +   (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
>    rtx mem = XEXP (operands[1], 0);
> @@ -5651,6 +5652,7 @@
>    [(match_operand:V16QI 0 "memory_operand")
>     (match_operand:V16QI 1 "vlogical_operand")
>     (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")
>    ]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {

Nice, thanks!

> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 2b41cb7fb7b..265c76f1609 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5213,7 +5213,10 @@ which must be a vector mode.  Operand 2 has whichever integer mode the
>  target prefers.  If operand 2 exceeds the number of elements in mode
>  @var{m}, the behavior is undefined.  If the target prefers the length
>  to be measured in bytes rather than elements, it should only implement
> -this pattern for vectors of @code{QI} elements.
> +this pattern for vectors of @code{QI} elements.  Operand 3 specifies
> +a bias predicate that determines whether a length of zero is permitted
> +or not.  If permitted, the predicate should only allow a zero immediate,
> +otherwhise it should only allow an immediate value of -1.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> @@ -5226,7 +5229,10 @@ a vector mode.  Operand 2 has whichever integer mode the target prefers.
>  If operand 2 exceeds the number of elements in mode @var{m}, the behavior
>  is undefined.  If the target prefers the length to be measured in bytes
>  rather than elements, it should only implement this pattern for vectors
> -of @code{QI} elements.
> +of @code{QI} elements.  Operand 3 specifies a bias predicate that
> +determines whether a length of zero is permitted or not.  If permitted,
> +the predicate should only allow a zero constant, otherwhise it should
> +only allow an immediate value of -1.
>  

Nit: s/otherwhise/otherwise/ (same for len_load).

Since these optabs are also for length in elements (although there is not
this kind of usage), I guess the current bias -1 support would work well
for length in elements too?  Nice!  :)

>  This pattern is not allowed to @code{FAIL}.
>  
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..90fdc440248 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_output_operand (&ops[0], target, TYPE_MODE (type));
>    create_fixed_operand (&ops[1], mem);
>    if (optab == len_load_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> +    {
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    expand_insn (icode, 3, ops);
> +    }
> +
>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_fixed_operand (&ops[0], mem);
>    create_input_operand (&ops[1], reg, TYPE_MODE (type));
>    if (optab == len_store_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> -    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    {
> +      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 3, ops);
> +    }
>  }
>  
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
>  	  && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>  
> +/* Return the supported bias for the len_load IFN.  For now we only support
> +   the biases of 0 and -1 (in case 0 is not an allowable length for len_load).
> +   If none of the biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_bias (internal_fn ifn, machine_mode mode)

Nit: this function is used for both len_load and len_store.  We have a similar
function name get_len_load_store_mode, maybe it's better to align it with 
a name like internal_len_load_store_bias?

If it's a good idea, it would be also applied for LOOP_VINFO_PARTIAL_LOAD_BIAS
(to LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS).

> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* For now we only support biases of 0 or -1.  Try both of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +	return 0;
> +      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +	return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>  
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..c057df1327a 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>  						poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_bias (internal_fn ifn,
> +					   machine_mode);
>  
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>  				    bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..79cf5029faf 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -436,7 +436,14 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    tree length_limit = NULL_TREE;
>    /* For length, we need length_limit to ensure length in range.  */
>    if (!use_masks_p)
> -    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +    {
> +      int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
> +      if (partial_load_bias == 0)
> +	length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +      else
> +	length_limit = build_int_cst (compare_type,
> +				      nitems_per_ctrl + partial_load_bias);
> +    }

Nit: this could be simplified as

  length_limit = build_int_cst (compare_type, nitems_per_ctrl + partial_load_bias);
 
>  
>    /* Calculate the maximum number of item values that the rgroup
>       handles in total, the number that it handles for each iteration
> @@ -560,8 +567,12 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>      {
>        /* Previous controls will cover BIAS items.  This control covers the
>  	 next batch.  */
> +      tree bias_tree;
>        poly_uint64 bias = nitems_per_ctrl * i;
> -      tree bias_tree = build_int_cst (compare_type, bias);
> +      int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
> +      if (partial_load_bias != 0)
> +	bias -= partial_load_bias;
> +      bias_tree = build_int_cst (compare_type, bias);
>  
>        /* See whether the first iteration of the vector loop is known
>  	 to have a full control.  */
> @@ -574,7 +585,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  	 TEST_LIMIT, prefer to use the same 0-based IV for each control
>  	 and adjust the bound down by BIAS.  */
>        tree this_test_limit = test_limit;
> -      if (i != 0)
> +      if (i != 0 || (i == 0 && partial_load_bias != 0))
>  	{
>  	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
>  					  compare_type, this_test_limit,

The changes in gcc/tree-vect-loop-manip.c seem not work for all cases, especially
for the length_limit boundary case?  such as: char array with length 17
  
  for (int i = 0; i < 17; i++)
    c[i] = a[i] + b[i];

The next_len is computed as zero, it's intended to be used for new length in the
next loop iteration, but it's also used for loop exit check, the loop would end
unexpectedly.

Could you have a double check?

IIUC, a change equivalent to v1 would be to simply update the loop_len at the
beginning of the loop header with bias -1, which ensures all of its following
use sites adopt the same biased length.

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..705439cb380 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,21 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo) = partial_load_bias;
> +

It seems better to add
 
  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
     return false;

to align with the documentation that we only support bias 0 and -1.


> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;
> +

Maybe it's better to move LOOP_VINFO_PARTIAL_LOAD_BIAS setting after
the checkings and assert that we have the same bias for len_store, such as:

  signed char partial_store_bias = internal_len_load_store_bias (IFN_LEN_STORE, len_load_mode);
  gcc_assert (partial_load_bias == partial_store_bias);

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;


>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4140,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	 here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
>        bool need_iterate_p
>  	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>  	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4173,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	       for each since start index is zero.  */
>  	    prologue_stmts += num_vectors;
>  
> +	    /* If we have a non-zero partial load bias, we need one MINUS
> +	       and a MAX to adjust the load length.  */
> +	    if (partial_load_bias != 0)
> +	      prologue_stmts += 2;
> +
>  	    /* Each may need two MINs and one MINUS to update lengths in body
>  	       for next iteration.  */
>  	    if (need_iterate_p)
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..57ecafb7965 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
>  						   gsi);
>  		      vec_oprnd = var;
>  		    }
> +
> +		  /* Check which bias value to use.  */
> +		  char biasval = internal_len_load_bias
> +		    (IFN_LEN_STORE, new_vmode);

Nit: seems better to align with other places, s/char/signed char/,
same for below.

> +
> +		  tree bias = build_int_cst (intQI_type_node, biasval);
>  		  gcall *call
> -		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -						  ptr, final_len, vec_oprnd);
> +		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +						  ptr, final_len, vec_oprnd,
> +						  bias);
>  		  gimple_call_set_nothrow (call, true);
>  		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>  		  new_stmt = call;
> @@ -9588,24 +9595,32 @@ vectorizable_load (vec_info *vinfo,
>  					       vec_num * j + i);
>  			tree ptr = build_int_cst (ref_type,
>  						  align * BITS_PER_UNIT);
> +
> +			machine_mode vmode = TYPE_MODE (vectype);
> +			opt_machine_mode new_ovmode
> +			  = get_len_load_store_mode (vmode, true);
> +			machine_mode new_vmode = new_ovmode.require ();
> +			tree qi_type = unsigned_intQI_type_node;
> +			tree new_vtype
> +			  = build_vector_type_for_mode (qi_type, new_vmode);

Nit: this new_vtype assignment could still stay in the below hunk ...

> +
> +			/* Check which bias value to use.  */
> +			char biasval = internal_len_load_bias
> +			  (IFN_LEN_LOAD, new_vmode);
> +
> +			tree bias = build_int_cst (intQI_type_node, biasval);
> +
>  			gcall *call
> -			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>  							dataref_ptr, ptr,
> -							final_len);
> +							final_len, bias);
>  			gimple_call_set_nothrow (call, true);
>  			new_stmt = call;
>  			data_ref = NULL_TREE;
>  
>  			/* Need conversion if it's wrapped with VnQI.  */
> -			machine_mode vmode = TYPE_MODE (vectype);
> -			opt_machine_mode new_ovmode
> -			  = get_len_load_store_mode (vmode, true);
> -			machine_mode new_vmode = new_ovmode.require ();
>  			if (vmode != new_vmode)
>  			  {
> -			    tree qi_type = unsigned_intQI_type_node;
> -			    tree new_vtype
> -			      = build_vector_type_for_mode (qi_type, new_vmode);

here.

BR,
Kewen

>  			    tree var = vect_get_new_ssa_name (new_vtype,
>  							      vect_simple_var);
>  			    gimple_set_lhs (call, var);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index c4c5678e7f1..b59f5f6643c 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -749,6 +749,11 @@ public:
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
>  
> +  /* The bias for len_load and len_store.  For now, only 0 and -1 are
> +     supported.  -1 must be used when a backend does not support
> +     len_load/len_store with a length of zero.  */
> +  signed char partial_load_bias;
> +
>    /* When we have grouped data accesses with gaps, we may introduce invalid
>       memory accesses.  We peel the last iteration of the loop to prevent
>       this.  */
> @@ -814,6 +819,7 @@ public:
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
> +#define LOOP_VINFO_PARTIAL_LOAD_BIAS(L)    (L)->partial_load_bias
>  #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
>  #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
>  #define LOOP_VINFO_MASKS(L)                (L)->masks
  
Robin Dapp Nov. 12, 2021, 9:56 a.m. UTC | #5
Hi Kewen and Richard,

the attached v3 addresses the comments to v2, among others:

 - Rename to load_store where appropriate.
 - Save the adjusted length as a separate control that is used instead
of loop_len with a bias != 0 and added to the loop header.
 - Update the costs to reflect a bias.

Bootstrap and regtest were fine on z15 and p9.

Regards
 Robin
  
Kewen.Lin Nov. 22, 2021, 6:55 a.m. UTC | #6
Hi Robin,

on 2021/11/12 下午5:56, Robin Dapp wrote:
> Hi Kewen and Richard,
> 
> the attached v3 addresses the comments to v2, among others:
> 
>  - Rename to load_store where appropriate.
>  - Save the adjusted length as a separate control that is used instead
> of loop_len with a bias != 0 and added to the loop header.
>  - Update the costs to reflect a bias.
> 
> Bootstrap and regtest were fine on z15 and p9.
> 

Nice!  Some minor comments are inlined below.

> Regards
>  Robin
> 
> 
> vll-v3.patch
> 

...

>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>  				    bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..931378820ac 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>  static tree
>  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  				 gimple_seq *preheader_seq,
> +				 gimple_seq *header_seq,
>  				 gimple_stmt_iterator loop_cond_gsi,
>  				 rgroup_controls *rgc, tree niters,
>  				 tree niters_skip, bool might_wrap_p)
> @@ -436,7 +437,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    tree length_limit = NULL_TREE;
>    /* For length, we need length_limit to ensure length in range.  */
>    if (!use_masks_p)
> -    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +      length_limit = build_int_cst (compare_type, nitems_per_ctrl);
>  

Nit, seems like an unintentional change.

>    /* Calculate the maximum number of item values that the rgroup
>       handles in total, the number that it handles for each iteration
> @@ -560,8 +561,9 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>      {
>        /* Previous controls will cover BIAS items.  This control covers the
>  	 next batch.  */
> +      tree bias_tree;
>        poly_uint64 bias = nitems_per_ctrl * i;
> -      tree bias_tree = build_int_cst (compare_type, bias);
> +      bias_tree = build_int_cst (compare_type, bias);
>  

Same as above.

>        /* See whether the first iteration of the vector loop is known
>  	 to have a full control.  */
> @@ -664,6 +666,20 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  
>        vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
>      }
> +
> +  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +  if (partial_load_bias != 0
> +      && partial_load_bias != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    {

IIUC, we don't need to check VECT_PARTIAL_BIAS_UNSUPPORTED again?  Since it's
at the stage of transformation, we have checked it before for sure?

> +      tree adjusted_len = rgc->bias_adjusted_ctrl;
> +      gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR,
> +					    rgc->controls[0],
> +					    build_int_cst
> +					    (TREE_TYPE (rgc->controls[0]),
> +					     -partial_load_bias));
> +      gimple_seq_add_stmt (header_seq, minus);
> +    }
> +
>    return next_ctrl;
>  }
>  
> @@ -744,6 +760,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>  	/* Set up all controls for this group.  */
>  	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>  						     &preheader_seq,
> +						     &header_seq,
>  						     loop_cond_gsi, rgc,
>  						     niters, niters_skip,
>  						     might_wrap_p);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..ceeb6920871 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, true).require ();
> +  machine_mode len_store_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_store_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  signed char partial_store_bias = internal_len_load_store_bias
> +    (IFN_LEN_STORE, len_store_mode);
> +
> +  gcc_assert (partial_load_bias == partial_store_bias);
> +
> +  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    return false;
> +
> +  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +

Nit, it seems better to move this assignment after the below early return.

> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	 here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_store_bias
> +	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        bool need_iterate_p
>  	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>  	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	       for each since start index is zero.  */
>  	    prologue_stmts += num_vectors;
>  
> +	    /* If we have a non-zero partial load bias, we need one MINUS
> +	       and a MAX to adjust the load length.  */
> +	    if (partial_load_store_bias != 0)
> +	      prologue_stmts += 2;
> +

IIUC, now that we use the biased length based on the PHI-ed loop_len, it only
incurs one extra unit of cost for the MINUS, and that cost belongs to the body
rather than the prologue?

>  	    /* Each may need two MINs and one MINUS to update lengths in body
>  	       for next iteration.  */
>  	    if (need_iterate_p)
> @@ -9226,6 +9258,13 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>  
> +  signed char partial_load_store_bias =
> +    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +  bool use_bias_adjusted_len =
> +    partial_load_store_bias != VECT_PARTIAL_BIAS_UNSUPPORTED
> +    && partial_load_store_bias != 0;
> +

Nit, VECT_PARTIAL_BIAS_UNSUPPORTED check seems redundant here too.

BR,
Kewen
  
Robin Dapp Dec. 7, 2021, 11:52 a.m. UTC | #7
Hi Kewen, Richard,

thanks for the comments, I addressed them in the attached v4.

Bootstrap and regtest are good as before.

Regards
 Robin
  
Richard Sandiford Dec. 10, 2021, 7:21 p.m. UTC | #8
Robin Dapp <rdapp@linux.ibm.com> writes:
> Hi Kewen, Richard,
>
> thanks for the comments, I addressed them in the attached v4.

Sorry again for the slow review.  I only have some very minor
comments left:

> Bootstrap and regtest are good as before.
>
> Regards
>  Robin
>
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index bf033e31c1c..dc2756f83e9 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5637,7 +5637,8 @@
>  (define_expand "len_load_v16qi"
>    [(match_operand:V16QI 0 "vlogical_operand")
>     (match_operand:V16QI 1 "memory_operand")
> -   (match_operand:QI 2 "gpc_reg_operand")]
> +   (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
>    rtx mem = XEXP (operands[1], 0);
> @@ -5651,6 +5652,7 @@
>    [(match_operand:V16QI 0 "memory_operand")
>     (match_operand:V16QI 1 "vlogical_operand")
>     (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")
>    ]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 2b41cb7fb7b..8df61f578bd 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5213,7 +5213,10 @@ which must be a vector mode.  Operand 2 has whichever integer mode the
>  target prefers.  If operand 2 exceeds the number of elements in mode
>  @var{m}, the behavior is undefined.  If the target prefers the length
>  to be measured in bytes rather than elements, it should only implement
> -this pattern for vectors of @code{QI} elements.
> +this pattern for vectors of @code{QI} elements.  Operand 3 specifies
> +a bias predicate that determines whether a length of zero is permitted
> +or not.  If permitted, the predicate should only allow a zero immediate,
> +otherwise it should only allow an immediate value of -1.

I think it would be better to fold this into the existing documentation
a bit more:

--------------------------------------------------------------------------
Load (operand 2 - operand 3) elements from vector memory operand 1 into
vector register operand 0, setting the other elements of operand 0 to
undefined values.  Operands 0 and 1 have mode @var{m}, which must be a
vector mode.  Operand 2 has whichever integer mode the target prefers.
Operand 3 conceptually has mode @code{QI}.

Operand 2 can be a variable or a constant amount.  Operand 3 specifies
a constant bias: it is either a constant 0 or a constant -1.  The predicate
on operand 3 must only accept the bias values that the target actually
supports.  GCC handles a bias of 0 more efficiently than a bias of -1.

If (operand 2 - operand 3) exceeds the number of elements in mode @var{m},
the behavior is undefined.

If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements.
--------------------------------------------------------------------------

> @@ -5226,7 +5229,10 @@ a vector mode.  Operand 2 has whichever integer mode the target prefers.
>  If operand 2 exceeds the number of elements in mode @var{m}, the behavior
>  is undefined.  If the target prefers the length to be measured in bytes
>  rather than elements, it should only implement this pattern for vectors
> -of @code{QI} elements.
> +of @code{QI} elements.  Operand 3 specifies a bias predicate that
> +determines whether a length of zero is permitted or not.  If permitted,
> +the predicate should only allow a zero constant, otherwise it should
> +only allow an immediate value of -1.
>  
>  This pattern is not allowed to @code{FAIL}.

Similarly here I think we should say:

--------------------------------------------------------------------------
Store (operand 2 - operand 3) vector elements from vector register
operand 1 into memory operand 0, leaving the other elements of operand 0
unchanged.  Operands 0 and 1 have mode @var{m}, which must be a vector mode.
Operand 2 has whichever integer mode the target prefers.  Operand 3
conceptually has mode @code{QI}.

Operand 2 can be a variable or a constant amount.  Operand 3 specifies
a constant bias: it is either a constant 0 or a constant -1.  The predicate
on operand 3 must only accept the bias values that the target actually
supports.  GCC handles a bias of 0 more efficiently than a bias of -1.

If (operand 2 - operand 3) exceeds the number of elements in mode @var{m},
the behavior is undefined.

If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements.
--------------------------------------------------------------------------

> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..d45f080c06f 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_output_operand (&ops[0], target, TYPE_MODE (type));
>    create_fixed_operand (&ops[1], mem);
>    if (optab == len_load_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> +    {
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    expand_insn (icode, 3, ops);
> +    }
> +

Formatting nit: the “else” body should be indented like the “then” body.

>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_fixed_operand (&ops[0], mem);
>    create_input_operand (&ops[1], reg, TYPE_MODE (type));
>    if (optab == len_store_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> -    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    {
> +      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 3, ops);
> +    }
>  }
>  
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
>  	  && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>  
> +/* Return the supported bias for the len_load IFN.  For now we only support

“for IFN, which is either IFN_LEN_LOAD or IFN_LEN_STORE.”

> +   the biases of 0 and -1 (in case 0 is not an allowable length for len_load).

…or len_store

> +   If none of the biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* For now we only support biases of 0 or -1.  Try both of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +	return 0;
> +      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +	return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>  
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..d46aa4ebf33 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>  						poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_store_bias (internal_fn ifn,
> +						 machine_mode);
>  
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>  				    bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
> index a94490373c3..c4a6492b50d 100644
> --- a/gcc/tree-ssa-math-opts.c
> +++ b/gcc/tree-ssa-math-opts.c
> @@ -2723,16 +2723,7 @@ convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
>  	  from_unsigned1 = from_unsigned2 = false;
>  	}
>        else
> -	{
> -	  /* Expand can synthesize smul_widen_optab if the target
> -	     supports umul_widen_optab.  */
> -	  op = umul_widen_optab;
> -	  handler = find_widening_optab_handler_and_mode (op, to_mode,
> -							  from_mode,
> -							  &actual_mode);
> -	  if (handler == CODE_FOR_nothing)
> -	    return false;
> -	}
> +	return false;

This part looks unrelated.

>      }
>  
>    /* Ensure that the inputs to the handler are in the correct precison
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..4fd078d535a 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>  static tree
>  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  				 gimple_seq *preheader_seq,
> +				 gimple_seq *header_seq,
>  				 gimple_stmt_iterator loop_cond_gsi,
>  				 rgroup_controls *rgc, tree niters,
>  				 tree niters_skip, bool might_wrap_p)
> @@ -664,6 +665,19 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  
>        vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
>      }
> +
> +  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +  if (partial_load_bias != 0)
> +    {
> +      tree adjusted_len = rgc->bias_adjusted_ctrl;
> +      gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR,
> +					    rgc->controls[0],
> +					    build_int_cst
> +					    (TREE_TYPE (rgc->controls[0]),
> +					     -partial_load_bias));

It would be more canonical to use PLUS_EXPR and not negate the bias.
MINUS_EXPR of a constant gets converted to PLUS_EXPR of the negative.

> +      gimple_seq_add_stmt (header_seq, minus);
> +    }
> +
>    return next_ctrl;
>  }
>  
> @@ -744,6 +758,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>  	/* Set up all controls for this group.  */
>  	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>  						     &preheader_seq,
> +						     &header_seq,
>  						     loop_cond_gsi, rgc,
>  						     niters, niters_skip,
>  						     might_wrap_p);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..8eb1726b4f5 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, true).require ();
> +  machine_mode len_store_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_store_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  signed char partial_store_bias = internal_len_load_store_bias
> +    (IFN_LEN_STORE, len_store_mode);
> +
> +  gcc_assert (partial_load_bias == partial_store_bias);
> +
> +  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    return false;
> +
> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;

Nit: return is indented two spaces too far.

> +
> +  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	 here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_store_bias
> +	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        bool need_iterate_p
>  	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>  	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	       for each since start index is zero.  */
>  	    prologue_stmts += num_vectors;
>  
> +	    /* If we have a non-zero partial load bias, we need one MINUS

Becomes PLUS after the change above.

> +	       to adjust the load length.  */
> +	    if (partial_load_store_bias != 0)
> +	      body_stmts += 1;
> +
>  	    /* Each may need two MINs and one MINUS to update lengths in body
>  	       for next iteration.  */
>  	    if (need_iterate_p)
> @@ -9226,6 +9258,11 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>  
> +  signed char partial_load_store_bias =
> +    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +  bool use_bias_adjusted_len = partial_load_store_bias != 0;
> +
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
>    if (rgl->controls.is_empty ())
> @@ -9235,6 +9272,15 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  	{
>  	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>  	  gcc_assert (len_type != NULL_TREE);
> +
> +	  if (i == 0 && use_bias_adjusted_len)
> +	    {
> +	      tree adjusted_len =
> +		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> +	      rgl->bias_adjusted_ctrl = adjusted_len;
> +	    }
> +

Think it would be better to make it:

	  if (use_bias_adjusted_len)
	    {
	      gcc_assert (i == 0);

But do we need to do this?  Code should only care about the final value,
so I didn't think we would need to keep the intermediate unbiased length
alongside the biased one.  (Or maybe we do.  My memory is a bit rusty,
sorry.)

>  	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
>  
>  	  /* Provide a dummy definition until the real one is available.  */
> @@ -9243,7 +9289,10 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  	}
>      }
>  
> -  return rgl->controls[index];
> +  if (use_bias_adjusted_len)
> +    return rgl->bias_adjusted_ctrl;
> +  else
> +    return rgl->controls[index];
>  }
>  
>  /* Scale profiling counters by estimation for LOOP which is vectorized
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..006f4c31217 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
>  						   gsi);
>  		      vec_oprnd = var;
>  		    }
> +
> +		  /* Check which bias value to use.  */
> +		  signed char biasval = internal_len_load_store_bias
> +		    (IFN_LEN_STORE, new_vmode);

I think we talked earlier about asserting that the biases agree
with the one chosen by vect_verify_loop_lens.

This is probably going back on an earlier decision, sorry, but I think
we should either do that or (simpler) just use
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo).  We'll then ICE later
if the target doesn't accept the bias.

> +
> +		  tree bias = build_int_cst (intQI_type_node, biasval);
>  		  gcall *call
> -		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -						  ptr, final_len, vec_oprnd);
> +		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +						  ptr, final_len, vec_oprnd,
> +						  bias);
>  		  gimple_call_set_nothrow (call, true);
>  		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>  		  new_stmt = call;
> @@ -9588,22 +9595,30 @@ vectorizable_load (vec_info *vinfo,
>  					       vec_num * j + i);
>  			tree ptr = build_int_cst (ref_type,
>  						  align * BITS_PER_UNIT);
> +
> +			machine_mode vmode = TYPE_MODE (vectype);
> +			opt_machine_mode new_ovmode
> +			  = get_len_load_store_mode (vmode, true);
> +			machine_mode new_vmode = new_ovmode.require ();
> +			tree qi_type = unsigned_intQI_type_node;
> +
> +			/* Check which bias value to use.  */
> +			signed char biasval = internal_len_load_store_bias
> +			  (IFN_LEN_LOAD, new_vmode);

Same here.

Thanks,
Richard

> +
> +			tree bias = build_int_cst (intQI_type_node, biasval);
> +
>  			gcall *call
> -			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>  							dataref_ptr, ptr,
> -							final_len);
> +							final_len, bias);
>  			gimple_call_set_nothrow (call, true);
>  			new_stmt = call;
>  			data_ref = NULL_TREE;
>  
>  			/* Need conversion if it's wrapped with VnQI.  */
> -			machine_mode vmode = TYPE_MODE (vectype);
> -			opt_machine_mode new_ovmode
> -			  = get_len_load_store_mode (vmode, true);
> -			machine_mode new_vmode = new_ovmode.require ();
>  			if (vmode != new_vmode)
>  			  {
> -			    tree qi_type = unsigned_intQI_type_node;
>  			    tree new_vtype
>  			      = build_vector_type_for_mode (qi_type, new_vmode);
>  			    tree var = vect_get_new_ssa_name (new_vtype,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index c4c5678e7f1..75fdfa31405 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -544,6 +544,10 @@ struct rgroup_controls {
>  
>    /* A vector of nV controls, in iteration order.  */
>    vec<tree> controls;
> +
> +  /* In case of len_load and len_store with a bias there is only one
> +     rgroup.  This holds the adjusted loop length for the this rgroup.  */
> +  tree bias_adjusted_ctrl;
>  };
>  
>  typedef auto_vec<rgroup_controls> vec_loop_masks;
> @@ -749,6 +753,11 @@ public:
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
>  
> +  /* The bias for len_load and len_store.  For now, only 0 and -1 are
> +     supported.  -1 must be used when a backend does not support
> +     len_load/len_store with a length of zero.  */
> +  signed char partial_load_store_bias;
> +
>    /* When we have grouped data accesses with gaps, we may introduce invalid
>       memory accesses.  We peel the last iteration of the loop to prevent
>       this.  */
> @@ -814,6 +823,7 @@ public:
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
> +#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
>  #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
>  #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
>  #define LOOP_VINFO_MASKS(L)                (L)->masks
  
Robin Dapp Dec. 13, 2021, 6:51 p.m. UTC | #9
Hi Richard,

I incorporated all your remarks (sorry for the hunk from a different
branch) except for this one:

> Think it would be better to make it:
> 
> 	  if (use_bias_adjusted_len)
> 	    {
> 	      gcc_assert (i == 0);
> 
> But do we need to do this?  Code should only care about the final value,
> so I didn't think we would need to keep the intermediate unbiased length
> alongside the biased one.  (Or maybe we do.  My memory is a bit rusty,
> sorry.)

I'd agree that we generally don't need to keep the unbiased length.
However "loop_len" being a phi node, I wasn't sure how or where to
properly apply the bias (except via creating a new variable like I did).
 I would be thankful for a pointer here.

Thanks
 Robin
  
Richard Sandiford Dec. 14, 2021, 11:47 a.m. UTC | #10
Robin Dapp <rdapp@linux.ibm.com> writes:
> Hi Richard,
>
> I incorporated all your remarks (sorry for the hunk from a different
> branch) except for this one:
>
>> Think it would be better to make it:
>> 
>> 	  if (use_bias_adjusted_len)
>> 	    {
>> 	      gcc_assert (i == 0);
>> 
>> But do we need to do this?  Code should only care about the final value,
>> so I didn't think we would need to keep the intermediate unbiased length
>> alongside the biased one.  (Or maybe we do.  My memory is a bit rusty,
>> sorry.)
>
> I'd agree that we generally don't need to keep the unbiased length.
> However "loop_len" being a phi node, I wasn't sure how or where to
> properly apply the bias (except via creating a new variable like I did).
>  Would be thankful about a pointer here.

Ah, I see.  Yeah, I guess the loop manip stuff does still need
access to the unbiased value, so I agree we should keep both.

Thanks,
Richard
  
Robin Dapp Jan. 10, 2022, 12:53 p.m. UTC | #11
Hi Richard,

> I think it would be better to fold this into the existing documentation
> a bit more:
[..]

done.

Fixed the remaining nits in the attached v5.

Bootstrap and regtest are good on s390x, Power9 and i386.

Regards
 Robin

--

gcc/ChangeLog:

        * config/rs6000/vsx.md: Use const0 bias predicate.
        * doc/md.texi: Document bias value.
        * internal-fn.c (expand_partial_load_optab_fn): Add bias.
        (expand_partial_store_optab_fn): Likewise.
        (internal_len_load_store_bias): New function.
        * internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define.
        (internal_len_load_store_bias): New function.
        * tree-vect-loop-manip.c (vect_set_loop_controls_directly): Set
bias.
        (vect_set_loop_condition_partial_vectors): Add header_seq parameter.
        * tree-vect-loop.c (vect_verify_loop_lens): Verify bias.
        (vect_estimate_min_profitable_iters): Account for bias.
        (vect_get_loop_len): Add bias-adjusted length.
        * tree-vect-stmts.c (vectorizable_store): Use.
        (vectorizable_load): Use.
        * tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted
length.
        (LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro.
  
Richard Sandiford Jan. 10, 2022, 2:21 p.m. UTC | #12
Robin Dapp <rdapp@linux.ibm.com> writes:
> Hi Richard,
>
>> I think it would be better to fold this into the existing documentation
>> a bit more:
> [..]
>
> done.
>
> Fixed the remaining nits in the attached v5.
>
> Bootstrap and regtest are good on s390x, Power9 and i386.
>
> Regards
>  Robin
>
> --
>
> gcc/ChangeLog:
>
>         * config/rs6000/vsx.md: Use const0 bias predicate.
>         * doc/md.texi: Document bias value.
>         * internal-fn.c (expand_partial_load_optab_fn): Add bias.
>         (expand_partial_store_optab_fn): Likewise.
>         (internal_len_load_store_bias): New function.
>         * internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define.
>         (internal_len_load_store_bias): New function.
>         * tree-vect-loop-manip.c (vect_set_loop_controls_directly): Set
> bias.
>         (vect_set_loop_condition_partial_vectors): Add header_seq parameter.
>         * tree-vect-loop.c (vect_verify_loop_lens): Verify bias.
>         (vect_estimate_min_profitable_iters): Account for bias.
>         (vect_get_loop_len): Add bias-adjusted length.
>         * tree-vect-stmts.c (vectorizable_store): Use.
>         (vectorizable_load): Use.
>         * tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted
> length.
>         (LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro.

OK, thanks!

Richard
  

Patch

commit 18a5fcd0f8835247e86d86fb018789fe755404be
Author: Robin Dapp <rdapp@linux.ibm.com>
Date:   Wed Oct 27 11:42:11 2021 +0200

    vect: Add bias parameter for partial vectorization
    
    This adds a bias parameter for LEN_LOAD and LEN_STORE as well as the
    corresponding internal functions.  A bias of 0 represents the status
    quo, while -1 is used for the s390 vll instruction that expects the
    highest byte to load rather than the number of bytes to load.
    Backends need to support one of these biases via an operand predicate
    and the vectorizer will then emit the appropriate variant.

diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8312d08aab2..993e32c1854 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2696,9 +2696,9 @@  expand_call_mem_ref (tree type, gcall *stmt, int index)
 static void
 expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, target, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, target, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2727,7 +2727,16 @@  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
   else
     create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+  if (optab == len_load_optab)
+    {
+      biast = gimple_call_arg (stmt, 3);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
+  else
+    expand_insn (icode, 3, ops);
+
   if (!rtx_equal_p (target, ops[0].value))
     emit_move_insn (target, ops[0].value);
 }
@@ -2741,9 +2750,9 @@  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 static void
 expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, reg, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, reg, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2770,7 +2779,16 @@  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
   else
     create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+
+  if (optab == len_store_optab)
+    {
+      biast = gimple_call_arg (stmt, 4);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
+  else
+    expand_insn (icode, 3, ops);
 }
 
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
@@ -4172,6 +4190,30 @@  internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
 	  && insn_operand_matches (icode, 4, GEN_INT (align)));
 }
 
+/* Return the supported bias for the len_load or len_store IFN.  For now we
+   only support a default bias of 0 and a bias of -1, the latter for targets
+   where a length of 0 is not an allowable operand.  If neither bias matches
+   what the backend provides, return VECT_PARTIAL_BIAS_UNSUPPORTED.  */
+
+signed char
+internal_len_load_bias_supported (internal_fn ifn, machine_mode mode)
+{
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      /* We only support a bias of 0 (default) or -1.  Try both
+	 of them.  */
+      if (insn_operand_matches (icode, 3, GEN_INT (0)))
+	return 0;
+      else if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+	return -1;
+    }
+
+  return VECT_PARTIAL_BIAS_UNSUPPORTED;
+}
+
 /* Expand STMT as though it were a call to internal function FN.  */
 
 void
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 19d0f849a5a..af28cf0d566 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -227,6 +227,10 @@  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
 						poly_uint64, unsigned int);
+#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
+
+extern signed char internal_len_load_bias_supported (internal_fn ifn,
+						     machine_mode);
 
 extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
 				    bool, bool, bool, bool, tree *);
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e94356d76e9..cd2c33fc4a7 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1163,6 +1163,15 @@  vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
+  opt_machine_mode len_load_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, false);
+  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+     len_loads with a length of zero.  With a single rgroup control the
+     length is nonzero whenever the vectorized body executes, so in order
+     to rule out zero lengths we prohibit more than one loop length here.  */
+  if (internal_len_load_bias_supported (IFN_LEN_LOAD, len_load_mode.require ())
+      == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+      return false;
+
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
   rgroup_controls *rgl;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 17849b575b7..c3df26c8009 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8289,9 +8289,30 @@  vectorizable_store (vec_info *vinfo,
 						   gsi);
 		      vec_oprnd = var;
 		    }
+
+		  /* Check which bias value to use.  Default is 0.
+		     A bias of -1 means that we cannot emit a LEN_STORE with
+		     a length of 0 and need to subtract 1 from the length.  */
+		  char biasval = internal_len_load_bias_supported
+		    (IFN_LEN_STORE, new_vmode);
+		  tree bias = build_int_cst (intQI_type_node, biasval);
+		  tree new_len = final_len;
+		  if (biasval != 0
+		      && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
+		    {
+		      new_len = make_ssa_name (TREE_TYPE (final_len));
+		      gassign *m1
+			= gimple_build_assign (new_len, MINUS_EXPR,
+					       final_len,
+					       build_one_cst (TREE_TYPE
+							  (final_len)));
+		      vect_finish_stmt_generation (vinfo, stmt_info, m1,
+						   gsi);
+		    }
 		  gcall *call
-		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
-						  ptr, final_len, vec_oprnd);
+		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
+						  ptr, new_len, vec_oprnd,
+						  bias);
 		  gimple_call_set_nothrow (call, true);
 		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 		  new_stmt = call;
@@ -9588,24 +9609,46 @@  vectorizable_load (vec_info *vinfo,
 					       vec_num * j + i);
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
+
+			machine_mode vmode = TYPE_MODE (vectype);
+			opt_machine_mode new_ovmode
+			  = get_len_load_store_mode (vmode, true);
+			machine_mode new_vmode = new_ovmode.require ();
+			tree qi_type = unsigned_intQI_type_node;
+			tree new_vtype
+			  = build_vector_type_for_mode (qi_type, new_vmode);
+
+			/* Check which bias value to use.  Default is 0.
+			   A bias of -1 means that we cannot emit a LEN_LOAD
+			   with a length of 0 and need to subtract 1 from
+			   the length.  */
+			char biasval = internal_len_load_bias_supported
+			  (IFN_LEN_LOAD, new_vmode);
+
+			tree bias = build_int_cst (intQI_type_node, biasval);
+			tree new_len = final_len;
+			if (biasval != 0
+			    && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
+			  {
+			    new_len = make_ssa_name (TREE_TYPE (final_len));
+			    gassign *m1 = gimple_build_assign (new_len,
+							       MINUS_EXPR,
+							       final_len,
+							       build_one_cst
+							       (TREE_TYPE
+								(final_len)));
+			    vect_finish_stmt_generation (vinfo, stmt_info, m1,
+							 gsi);
+			  }
+
 			gcall *call
-			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
 							dataref_ptr, ptr,
-							final_len);
+							new_len, bias);
 			gimple_call_set_nothrow (call, true);
 			new_stmt = call;
 			data_ref = NULL_TREE;
 
 			/* Need conversion if it's wrapped with VnQI.  */
-			machine_mode vmode = TYPE_MODE (vectype);
-			opt_machine_mode new_ovmode
-			  = get_len_load_store_mode (vmode, true);
-			machine_mode new_vmode = new_ovmode.require ();
 			if (vmode != new_vmode)
 			  {
-			    tree qi_type = unsigned_intQI_type_node;
-			    tree new_vtype
-			      = build_vector_type_for_mode (qi_type, new_vmode);
 			    tree var = vect_get_new_ssa_name (new_vtype,
 							      vect_simple_var);
 			    gimple_set_lhs (call, var);