s390: define single step vector casts

Message ID 20240620070611.11071-1-jchrist@linux.ibm.com
State New
Headers
Series s390: define single step vector casts |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed

Commit Message

Juergen Christ June 20, 2024, 7:06 a.m. UTC
  Some casts were missing, leading to missed or bad vectorizations where
casting was done on scalars, followed by a vector creation from the
individual elements.

gcc/ChangeLog:

	* config/s390/vector.md (VI_TRUNC): New mode iterator.
	(VEC_HALF_NARROWED): New mode attribute.
	(vec_half_narrowed): ditto.
	(trunc<VI_TRUNC:mode><vec_half_narrowed>2): New pattern.
	(vec_pack_ufix_trunc_v2df): ditto.
	(vec_pack_sfix_trunc_v2df): ditto.
	(vec_unpack_sfix_trunc_lo_v4sf): ditto.
	(vec_unpack_sfix_trunc_hi_v4sf): ditto.
	(vec_unpack_ufix_trunc_lo_v4sf): ditto.
	(vec_unpack_ufix_trunc_hi_v4sf): ditto.
	(floatv2siv2sf2): ditto.
	(floatunsv2siv2sf2): ditto.
	(vec_unpacks_float_hi_v4si): ditto.
	(vec_unpacks_float_lo_v4si): ditto.
	(vec_unpacku_float_hi_v4si): ditto.
	(vec_unpacku_float_lo_v4si): ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/vector/vec-cast-single.c: New test.
	* gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test.

Bootstrapped and regtested on s390x.  Ok for trunk?

Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>
---
 gcc/config/s390/vector.md                     | 170 ++++++++++-
 .../gcc.target/s390/vector/vec-cast-single.c  | 271 ++++++++++++++++++
 .../s390/vector/vec_pack_ufix_trunc_v2df.c    |  30 ++
 3 files changed, 463 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
  

Comments

Stefan Schulze Frielinghaus June 20, 2024, 3:39 p.m. UTC | #1
On Thu, Jun 20, 2024 at 09:06:11AM +0200, Juergen Christ wrote:
> Some casts were missing leading to missed of bad vectorizations where
> casting was done scalar followed by a vector creation from the
> individual elements.
> 
> gcc/ChangeLog:
> 
> 	* config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator.
> 	(vec_half_narrowed): ditto.
> 	(trunc<VI_TRUNC:mode><vec_half_narrowed>2): New pattern.
> 	(vec_pack_ufix_trunc_v2df): ditto.
> 	(vec_pack_sfix_trunc_v2df): ditto.
> 	(vec_unpack_sfix_trunc_lo_v4sf): ditto.
> 	(vec_unpack_sfix_trunc_hi_v4sf): ditto.
> 	(vec_unpack_ufix_trunc_lo_v4sf): ditto.
> 	(vec_unpack_ufix_trunc_hi_v4sf): ditto.
> 	(floatv2siv2sf2): ditto.
> 	(floatunsv2siv2sf2): ditto.
> 	(vec_unpacks_float_hi_v4si): ditto.
> 	(vec_unpacks_float_lo_v4si): ditto.
> 	(vec_unpacku_float_hi_v4si): ditto.
> 	(vec_unpacku_float_lo_v4si): ditto.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/s390/vector/vec-cast-single.c: New test.
> 	* gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test.
> 
> Bootstrapped and regtested on s390x.  Ok for trunk?
> 
> Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>
> ---
>  gcc/config/s390/vector.md                     | 170 ++++++++++-
>  .../gcc.target/s390/vector/vec-cast-single.c  | 271 ++++++++++++++++++
>  .../s390/vector/vec_pack_ufix_trunc_v2df.c    |  30 ++
>  3 files changed, 463 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
> 
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index 40de0c75a7cf..356f25d26deb 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -89,6 +89,8 @@
>  
>  (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
>  
> +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI])
> +
>  ; Empty string for all but TImode.  This is used to hide the TImode
>  ; expander name in case it is defined already.  See addti3 for an
>  ; example.
> @@ -211,6 +213,14 @@
>  			       (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
>  			       (V1DF "v1tf") (V2DF "v2tf")])
>  
> +; Vector with narrowed element size and the same number of elements.
> +(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
> +                   (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI")
> +				   (V1DI "V1DI") (V2DI "V2SI")])
> +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI "v4qi") (V8HI "v8qi")
> +                   (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi")
> +				   (V1DI "v1di") (V2DI "v2si")])
> +
>  ; Vector with half the element size AND half the number of elements.
>  (define_mode_attr vec_halfhalf
>    [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
> @@ -2422,6 +2432,17 @@
>    operands[2] = gen_reg_rtx (V4SFmode);
>  })
>  
> +;; vector truncate
> +
> +; downcasts
> +
> +(define_insn "trunc<VI_TRUNC:mode><vec_half_narrowed>2"
> +  [(set (match_operand:<VEC_HALF_NARROWED> 0 "register_operand" "=v")
> +    (truncate:<VEC_HALF_NARROWED> (match_operand:VI_TRUNC 1 "register_operand" "v")))]
> +  "TARGET_VX"
> +  "vpk<bhfgq>\t %0,%1,%1"
              ~~~~^~~~~
whitespace

> +  [(set_attr "op_type" "VRR")])
> +
>  ;; vector unpack v16qi
>  
>  ; signed
> @@ -3177,17 +3198,150 @@
>    emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
>    emit_insn (gen_vstlv16qi (operands[1], len, mem));
>    DONE;
> -});;
> +})
> +
> +(define_expand "vec_pack_ufix_trunc_v2df"
> +  [(match_operand:V4SI 0 "register_operand")
> +   (match_operand:V2DF 1 "register_operand")
> +   (match_operand:V2DF 2 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r1 = gen_reg_rtx (V2DImode);
> +  rtx r2 = gen_reg_rtx (V2DImode);
> +
> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1]));
> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2]));
> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
> +  DONE;
> +})

I haven't fully wrapped my head around this; however, this two-step
conversion could miss an IEEE inexact exception if a double fits into a
64-bit integer but not into a 32-bit integer.  What does the IL/vectorizer
say about exceptions?  Is it OK to miss some, or do we have to guard this
with -fno-trapping-math et al.?

> +
> +(define_expand "vec_pack_sfix_trunc_v2df"
> +  [(match_operand:V4SI 0 "register_operand")
> +   (match_operand:V2DF 1 "register_operand")
> +   (match_operand:V2DF 2 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r1 = gen_reg_rtx (V2DImode);
> +  rtx r2 = gen_reg_rtx (V2DImode);
> +
> +  emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1]));
> +  emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2]));
> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
> +  DONE;
> +})

same as above

> +
> +; v4sf -> v2di
> +(define_expand "vec_unpack_sfix_trunc_lo_v4sf"
> +  [(match_operand:V2DI 0 "register_operand")
> +   (match_operand:V4SF 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V4SImode);
                     ~~~^~~~
whitespace

> +
> +  emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1]));
> +  emit_insn (gen_vec_unpacks_lo_v4si (operands[0], r));
> +  DONE;
> +})

The wording of the internals document leaves some room for
interpretation.  When is widening supposed to happen?  The point in time
at which widening happens makes a difference if a rounded float does not
fit in a 32-bit integer but does fit in a 64-bit integer.  My gut feeling
is that the current implementation is correct, i.e., first converting a
32-bit float into a 32-bit integer and then extending it to a 64-bit integer.

> +
> +(define_expand "vec_unpack_sfix_trunc_hi_v4sf"
> +  [(match_operand:V2DI 0 "register_operand")
> +   (match_operand:V4SF 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V4SImode);
                     ~~~^~~~
whitespace

> +
> +  emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1]));
> +  emit_insn (gen_vec_unpacks_hi_v4si (operands[0], r));
> +  DONE;
> +})

same as above

> +
> +(define_expand "vec_unpack_ufix_trunc_lo_v4sf"
> +  [(match_operand:V2DI 0 "register_operand")
> +   (match_operand:V4SF 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V4SImode);
                     ~~~^~~~
whitespace

> +
> +  emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1]));
> +  emit_insn (gen_vec_unpacku_lo_v4si (operands[0], r));
> +  DONE;
> +})

same as above

> +
> +(define_expand "vec_unpack_ufix_trunc_hi_v4sf"
> +  [(match_operand:V2DI 0 "register_operand")
> +   (match_operand:V4SF 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V4SImode);
                     ~~~^~~~
whitespace

> +
> +  emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1]));
> +  emit_insn (gen_vec_unpacku_hi_v4si (operands[0], r));
> +  DONE;
> +})

same as above

>  
> +(define_insn "floatv2siv2sf2"
> +  [(set (match_operand:V2SF 0 "register_operand"  "=v")
> +	(float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
> +  "TARGET_VXE2"
> +  "vcefb\t%v0,%v1,0,5"
> +  [(set_attr "op_type" "VRR")])
> +
> +(define_insn "floatunsv2siv2sf2"
> +  [(set (match_operand:V2SF 0 "register_operand"  "=v")
> +	(unsigned_float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
> +  "TARGET_VXE2"
> +  "vcelfb\t%v0,%v1,0,5"
> +  [(set_attr "op_type" "VRR")])
> +
> +(define_expand "vec_unpacks_float_hi_v4si"
> +  [(match_operand:V2DF 0 "register_operand")
> +   (match_operand:V4SI 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V2DImode);
                     ~~~^~~~
whitespace

> +  
 ~~^~~

trailing whitespace

> +  emit_insn (gen_vec_unpacks_hi_v4si (r, operands[1]));
> +  emit_insn (gen_floatv2div2df2 (operands[0], r));
> +  DONE;
> +})
> +
> +(define_expand "vec_unpacks_float_lo_v4si"
> +  [(match_operand:V2DF 0 "register_operand")
> +   (match_operand:V4SI 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V2DImode);
                     ~~~^~~~
whitespace

> +  
 ~~^~~

trailing whitespace

> +  emit_insn (gen_vec_unpacks_lo_v4si (r, operands[1]));
> +  emit_insn (gen_floatv2div2df2 (operands[0], r));
> +  DONE;
> +})
> +
> +(define_expand "vec_unpacku_float_hi_v4si"
> +  [(match_operand:V2DF 0 "register_operand")
> +   (match_operand:V4SI 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V2DImode);
                     ~~~^~~~
whitespace

> +  
 ~~^~~

trailing whitespace

> +  emit_insn (gen_vec_unpacku_hi_v4si (r, operands[1]));
> +  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
> +  DONE;
> +})
> +
> +(define_expand "vec_unpacku_float_lo_v4si"
> +  [(match_operand:V2DF 0 "register_operand")
> +   (match_operand:V4SI 1 "register_operand")]
> +  "TARGET_VX"
> +{
> +  rtx r = gen_reg_rtx(V2DImode);
                     ~~~^~~~
whitespace

> +  
 ~~^~~

trailing whitespace

> +  emit_insn (gen_vec_unpacku_lo_v4si (r, operands[1]));
> +  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
> +  DONE;
> +})
>  
>  ; reduc_smin
>  ; reduc_smax
>  ; reduc_umin
>  ; reduc_umax
> -
> -; vec_pack_sfix_trunc: convert + pack ?
> -; vec_pack_ufix_trunc
> -; vec_unpacks_float_hi
> -; vec_unpacks_float_lo
> -; vec_unpacku_float_hi
> -; vec_unpacku_float_lo
> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
> new file mode 100644
> index 000000000000..59a154594e9f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
> @@ -0,0 +1,271 @@
> +/* Check that the single-step vector conversions work.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mzarch -march=z15 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-all" } */
> +/* { dg-final { scan-tree-dump-not "conversion not supported by target" "slp" } } */
> +
> +void
> +extendv4hiv4si2 (short *in, int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +void
> +zero_extendv4hiv4si2 (unsigned short *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +void
> +vec_unpacks_v4si (int *in, long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +void
> +vec_unpacku_v4si (unsigned int *in, unsigned long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +extedv2siv2di2 (int *in, long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +zero_extedv2siv2di2 (unsigned int *in, unsigned long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +truncv4siv4hi2_signed (int *in, short *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +truncv4siv4hi2_unsigned (unsigned int *in, unsigned short *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +truncv2div2si2_signed (long *in, int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +truncv2div2si2_unsigned (unsigned long *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +fix_truncv4sfv4si2 (float *in, int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +fixuns_truncv4sfv4si2 (float *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_pack_trunc_v2di__signed (long *in, int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_pack_trunc_v2di__unsigned (unsigned long *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_pack_sfix_trunc_v2df (double *in, int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_unpack_sfix_trunc (float *in, long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_unpack_ufix_trunc (float *in, unsigned long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +fix_truncv2dfv2di2 (double *in, long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +fixuns_truncv2dfv2di2 (double *in, unsigned long *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +floatv4hiv4sf2 (short *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +floatunsv4hiv4sf2 (unsigned short *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +floatv4siv4sf2 (int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +floatunsv4siv4sf2 (unsigned int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_packs_float_v2di (int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_packu_float_v2di (unsigned int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +floatv2div2df2 (long *in, double *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +floatunsv2div2df2 (unsigned long *in, double *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +
> +void
> +floatv2siv2sf2 (int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +
> +void
> +floatunsv2siv2sf2 (unsigned int *in, float *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +}
> +void
> +vec_unpacks_float_hi_v4si (int *in, double *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +void
> +vec_unpacku_float_hi_v4si (unsigned int *in, double *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
> new file mode 100644
> index 000000000000..4fcfbd88abe4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
> @@ -0,0 +1,30 @@
> +/* Check that vec_pack_ufix_trunc_v2df pattern is correctly used.  Even without
> +   this pattern, we will vectorize this code, but produce wrong output.  */
> +
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mzarch -march=z13 -ftree-vectorize -fvect-cost-model=unlimited" } */
> +
> +__attribute__((noinline,noclone,noipa))
> +void
> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out);
> +
> +void
> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
> +{
> +  out[0] = in[0];
> +  out[1] = in[1];
> +  out[2] = in[2];
> +  out[3] = in[3];
> +}
> +
> +int main()
> +{
> +  double in[] = {-1,-2,-3,-4};
> +  unsigned int out[4];
> +
> +  vec_pack_ufix_trunc_v2df (in, out);
> +  for (int i = 0; i < 4; ++i)
> +    if (out[i] != 0)
> +      __builtin_abort();
> +  return 0;
> +}
> -- 
> 2.43.0
>
  
Richard Biener June 20, 2024, 5 p.m. UTC | #2
> Am 20.06.2024 um 17:40 schrieb Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>:
> 
> On Thu, Jun 20, 2024 at 09:06:11AM +0200, Juergen Christ wrote:
>> Some casts were missing leading to missed of bad vectorizations where
>> casting was done scalar followed by a vector creation from the
>> individual elements.
>> 
>> gcc/ChangeLog:
>> 
>>    * config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator.
>>    (vec_half_narrowed): ditto.
>>    (trunc<VI_TRUNC:mode><vec_half_narrowed>2): New pattern.
>>    (vec_pack_ufix_trunc_v2df): ditto.
>>    (vec_pack_sfix_trunc_v2df): ditto.
>>    (vec_unpack_sfix_trunc_lo_v4sf): ditto.
>>    (vec_unpack_sfix_trunc_hi_v4sf): ditto.
>>    (vec_unpack_ufix_trunc_lo_v4sf): ditto.
>>    (vec_unpack_ufix_trunc_hi_v4sf): ditto.
>>    (floatv2siv2sf2): ditto.
>>    (floatunsv2siv2sf2): ditto.
>>    (vec_unpacks_float_hi_v4si): ditto.
>>    (vec_unpacks_float_lo_v4si): ditto.
>>    (vec_unpacku_float_hi_v4si): ditto.
>>    (vec_unpacku_float_lo_v4si): ditto.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>    * gcc.target/s390/vector/vec-cast-single.c: New test.
>>    * gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test.
>> 
>> Bootstrapped and regtested on s390x.  Ok for trunk?
>> 
>> Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>
>> ---
>> gcc/config/s390/vector.md                     | 170 ++++++++++-
>> .../gcc.target/s390/vector/vec-cast-single.c  | 271 ++++++++++++++++++
>> .../s390/vector/vec_pack_ufix_trunc_v2df.c    |  30 ++
>> 3 files changed, 463 insertions(+), 8 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
>> create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
>> 
>> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
>> index 40de0c75a7cf..356f25d26deb 100644
>> --- a/gcc/config/s390/vector.md
>> +++ b/gcc/config/s390/vector.md
>> @@ -89,6 +89,8 @@
>> 
>> (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
>> 
>> +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI])
>> +
>> ; Empty string for all but TImode.  This is used to hide the TImode
>> ; expander name in case it is defined already.  See addti3 for an
>> ; example.
>> @@ -211,6 +213,14 @@
>>                   (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
>>                   (V1DF "v1tf") (V2DF "v2tf")])
>> 
>> +; Vector with narrowed element size and the same number of elements.
>> +(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
>> +                   (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI")
>> +                   (V1DI "V1DI") (V2DI "V2SI")])
>> +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI "v4qi") (V8HI "v8qi")
>> +                   (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi")
>> +                   (V1DI "v1di") (V2DI "v2si")])
>> +
>> ; Vector with half the element size AND half the number of elements.
>> (define_mode_attr vec_halfhalf
>>   [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
>> @@ -2422,6 +2432,17 @@
>>   operands[2] = gen_reg_rtx (V4SFmode);
>> })
>> 
>> +;; vector truncate
>> +
>> +; downcasts
>> +
>> +(define_insn "trunc<VI_TRUNC:mode><vec_half_narrowed>2"
>> +  [(set (match_operand:<VEC_HALF_NARROWED> 0 "register_operand" "=v")
>> +    (truncate:<VEC_HALF_NARROWED> (match_operand:VI_TRUNC 1 "register_operand" "v")))]
>> +  "TARGET_VX"
>> +  "vpk<bhfgq>\t %0,%1,%1"
>              ~~~~^~~~~
> whitespace
> 
>> +  [(set_attr "op_type" "VRR")])
>> +
>> ;; vector unpack v16qi
>> 
>> ; signed
>> @@ -3177,17 +3198,150 @@
>>   emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
>>   emit_insn (gen_vstlv16qi (operands[1], len, mem));
>>   DONE;
>> -});;
>> +})
>> +
>> +(define_expand "vec_pack_ufix_trunc_v2df"
>> +  [(match_operand:V4SI 0 "register_operand")
>> +   (match_operand:V2DF 1 "register_operand")
>> +   (match_operand:V2DF 2 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r1 = gen_reg_rtx (V2DImode);
>> +  rtx r2 = gen_reg_rtx (V2DImode);
>> +
>> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1]));
>> +  emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2]));
>> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
>> +  DONE;
>> +})
> 
> I haven't really wrapped my head around this, however, this two step
> conversion could miss an IEEE-inexact-exception if a double fits into a
> 64-bit integer but not in a 32-bit integer.  What does the IL/vectorizer
> say about exceptions?  Ok to miss some or do we have to guard this by
> no-trapping-math et al.?

Yes, in other places we guard this with !flag_trapping_math

>> +
>> +(define_expand "vec_pack_sfix_trunc_v2df"
>> +  [(match_operand:V4SI 0 "register_operand")
>> +   (match_operand:V2DF 1 "register_operand")
>> +   (match_operand:V2DF 2 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r1 = gen_reg_rtx (V2DImode);
>> +  rtx r2 = gen_reg_rtx (V2DImode);
>> +
>> +  emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1]));
>> +  emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2]));
>> +  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
>> +  DONE;
>> +})
> 
> same as above
> 
>> +
>> +; v4sf -> v2di
>> +(define_expand "vec_unpack_sfix_trunc_lo_v4sf"
>> +  [(match_operand:V2DI 0 "register_operand")
>> +   (match_operand:V4SF 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V4SImode);
>                     ~~~^~~~
> whitespace
> 
>> +
>> +  emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1]));
>> +  emit_insn (gen_vec_unpacks_lo_v4si (operands[0], r));
>> +  DONE;
>> +})
> 
> The wording of the internals document leaves some room for
> interpretation.  When is widening supposed to happen?  The point in time
> when widening happens makes a difference if a rounded float does not fit
> in a 32-bit integer but in a 64-bit integer.  My gut feeling is that the
> current implementation is correct, i.e., first converting a 32-bit float
> into a 32-bit integer and then extending it to a 64-bit integer.
> 
>> +
>> +(define_expand "vec_unpack_sfix_trunc_hi_v4sf"
>> +  [(match_operand:V2DI 0 "register_operand")
>> +   (match_operand:V4SF 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V4SImode);
>                     ~~~^~~~
> whitespace
> 
>> +
>> +  emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1]));
>> +  emit_insn (gen_vec_unpacks_hi_v4si (operands[0], r));
>> +  DONE;
>> +})
> 
> same as above
> 
>> +
>> +(define_expand "vec_unpack_ufix_trunc_lo_v4sf"
>> +  [(match_operand:V2DI 0 "register_operand")
>> +   (match_operand:V4SF 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V4SImode);
>                     ~~~^~~~
> whitespace
> 
>> +
>> +  emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1]));
>> +  emit_insn (gen_vec_unpacku_lo_v4si (operands[0], r));
>> +  DONE;
>> +})
> 
> same as above
> 
>> +
>> +(define_expand "vec_unpack_ufix_trunc_hi_v4sf"
>> +  [(match_operand:V2DI 0 "register_operand")
>> +   (match_operand:V4SF 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V4SImode);
>                     ~~~^~~~
> whitespace
> 
>> +
>> +  emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1]));
>> +  emit_insn (gen_vec_unpacku_hi_v4si (operands[0], r));
>> +  DONE;
>> +})
> 
> same as above
> 
>> 
>> +(define_insn "floatv2siv2sf2"
>> +  [(set (match_operand:V2SF 0 "register_operand"  "=v")
>> +    (float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
>> +  "TARGET_VXE2"
>> +  "vcefb\t%v0,%v1,0,5"
>> +  [(set_attr "op_type" "VRR")])
>> +
>> +(define_insn "floatunsv2siv2sf2"
>> +  [(set (match_operand:V2SF 0 "register_operand"  "=v")
>> +    (unsigned_float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
>> +  "TARGET_VXE2"
>> +  "vcelfb\t%v0,%v1,0,5"
>> +  [(set_attr "op_type" "VRR")])
>> +
>> +(define_expand "vec_unpacks_float_hi_v4si"
>> +  [(match_operand:V2DF 0 "register_operand")
>> +   (match_operand:V4SI 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V2DImode);
>                     ~~~^~~~
> whitespace
> 
>> +  
> ~~^~~
> 
> trailing whitespace
> 
>> +  emit_insn (gen_vec_unpacks_hi_v4si (r, operands[1]));
>> +  emit_insn (gen_floatv2div2df2 (operands[0], r));
>> +  DONE;
>> +})
>> +
>> +(define_expand "vec_unpacks_float_lo_v4si"
>> +  [(match_operand:V2DF 0 "register_operand")
>> +   (match_operand:V4SI 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V2DImode);
>                     ~~~^~~~
> whitespace
> 
>> +  
> ~~^~~
> 
> trailing whitespace
> 
>> +  emit_insn (gen_vec_unpacks_lo_v4si (r, operands[1]));
>> +  emit_insn (gen_floatv2div2df2 (operands[0], r));
>> +  DONE;
>> +})
>> +
>> +(define_expand "vec_unpacku_float_hi_v4si"
>> +  [(match_operand:V2DF 0 "register_operand")
>> +   (match_operand:V4SI 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V2DImode);
>                     ~~~^~~~
> whitespace
> 
>> +  
> ~~^~~
> 
> trailing whitespace
> 
>> +  emit_insn (gen_vec_unpacku_hi_v4si (r, operands[1]));
>> +  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
>> +  DONE;
>> +})
>> +
>> +(define_expand "vec_unpacku_float_lo_v4si"
>> +  [(match_operand:V2DF 0 "register_operand")
>> +   (match_operand:V4SI 1 "register_operand")]
>> +  "TARGET_VX"
>> +{
>> +  rtx r = gen_reg_rtx(V2DImode);
>                     ~~~^~~~
> whitespace
> 
>> +  
> ~~^~~
> 
> trailing whitespace
> 
>> +  emit_insn (gen_vec_unpacku_lo_v4si (r, operands[1]));
>> +  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
>> +  DONE;
>> +})
>> 
>> ; reduc_smin
>> ; reduc_smax
>> ; reduc_umin
>> ; reduc_umax
>> -
>> -; vec_pack_sfix_trunc: convert + pack ?
>> -; vec_pack_ufix_trunc
>> -; vec_unpacks_float_hi
>> -; vec_unpacks_float_lo
>> -; vec_unpacku_float_hi
>> -; vec_unpacku_float_lo
>> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
>> new file mode 100644
>> index 000000000000..59a154594e9f
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
>> @@ -0,0 +1,271 @@
>> +/* Check that the single-step vector conversions work.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3 -mzarch -march=z15 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-all" } */
>> +/* { dg-final { scan-tree-dump-not "conversion not supported by target" "slp" } } */
>> +
>> +void
>> +extendv4hiv4si2 (short *in, int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +void
>> +zero_extendv4hiv4si2 (unsigned short *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +void
>> +vec_unpacks_v4si (int *in, long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +void
>> +vec_unpacku_v4si (unsigned int *in, unsigned long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +extedv2siv2di2 (int *in, long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +zero_extedv2siv2di2 (unsigned int *in, unsigned long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +truncv4siv4hi2_signed (int *in, short *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +truncv4siv4hi2_unsigned (unsigned int *in, unsigned short *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +truncv2div2si2_signed (long *in, int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +truncv2div2si2_unsigned (unsigned long *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +fix_truncv4sfv4si2 (float *in, int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +fixuns_truncv4sfv4si2 (float *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_pack_trunc_v2di__signed (long *in, int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_pack_trunc_v2di__unsigned (unsigned long *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_pack_sfix_trunc_v2df (double *in, int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_unpack_sfix_trunc (float *in, long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_unpack_ufix_trunc (float *in, unsigned long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +fix_truncv2dfv2di2 (double *in, long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +fixuns_truncv2dfv2di2 (double *in, unsigned long *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +floatv4hiv4sf2 (short *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +floatunsv4hiv4sf2 (unsigned short *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +floatv4siv4sf2 (int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +floatunsv4siv4sf2 (unsigned int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_packs_float_v2di (int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_packu_float_v2di (unsigned int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +floatv2div2df2 (long *in, double *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +floatunsv2div2df2 (unsigned long *in, double *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +
>> +void
>> +floatv2siv2sf2 (int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +
>> +void
>> +floatunsv2siv2sf2 (unsigned int *in, float *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +}
>> +void
>> +vec_unpacks_float_hi_v4si (int *in, double *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +void
>> +vec_unpacku_float_hi_v4si (unsigned int *in, double *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
>> new file mode 100644
>> index 000000000000..4fcfbd88abe4
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
>> @@ -0,0 +1,30 @@
>> +/* Check that vec_pack_ufix_trunc_v2df pattern is correctly used.  Even without
>> +   this pattern, we will vectorize this code, but produce wrong output.  */
>> +
>> +/* { dg-do run } */
>> +/* { dg-options "-O3 -mzarch -march=z13 -ftree-vectorize -fvect-cost-model=unlimited" } */
>> +
>> +__attribute__((noinline,noclone,noipa))
>> +void
>> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out);
>> +
>> +void
>> +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
>> +{
>> +  out[0] = in[0];
>> +  out[1] = in[1];
>> +  out[2] = in[2];
>> +  out[3] = in[3];
>> +}
>> +
>> +int main()
>> +{
>> +  double in[] = {-1,-2,-3,-4};
>> +  unsigned int out[4];
>> +
>> +  vec_pack_ufix_trunc_v2df (in, out);
>> +  for (int i = 0; i < 4; ++i)
>> +    if (out[i] != 0)
>> +      __builtin_abort();
>> +  return 0;
>> +}
>> --
>> 2.43.0
>>
  

Patch

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 40de0c75a7cf..356f25d26deb 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -89,6 +89,8 @@ 
 
 (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
 
+(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI])
+
 ; Empty string for all but TImode.  This is used to hide the TImode
 ; expander name in case it is defined already.  See addti3 for an
 ; example.
@@ -211,6 +213,14 @@ 
 			       (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
 			       (V1DF "v1tf") (V2DF "v2tf")])
 
+; Vector with narrowed element size and the same number of elements.
+(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
+				     (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI")
+				     (V1DI "V1SI") (V2DI "V2SI")]) ; element size halved, count kept
+(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI "v4qi") (V8HI "v8qi")
+				     (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi")
+				     (V1DI "v1si") (V2DI "v2si")]) ; lower case, for pattern names
+
 ; Vector with half the element size AND half the number of elements.
 (define_mode_attr vec_halfhalf
   [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
@@ -2422,6 +2432,17 @@ 
   operands[2] = gen_reg_rtx (V4SFmode);
 })
 
+;; vector truncate
+
+; downcasts
+
+(define_insn "trunc<VI_TRUNC:mode><vec_half_narrowed>2" ; pack operand 1 with itself
+  [(set (match_operand:<VEC_HALF_NARROWED> 0 "register_operand" "=v")
+	(truncate:<VEC_HALF_NARROWED> (match_operand:VI_TRUNC 1 "register_operand" "v")))]
+  "TARGET_VX"
+  "vpk<bhfgq>\t%v0,%v1,%v1"
+  [(set_attr "op_type" "VRR")])
+
 ;; vector unpack v16qi
 
 ; signed
@@ -3177,17 +3198,150 @@ 
   emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
   emit_insn (gen_vstlv16qi (operands[1], len, mem));
   DONE;
-});;
+})
+
+(define_expand "vec_pack_ufix_trunc_v2df" ; two V2DF -> one V4SI, unsigned
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand:V2DF 2 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r1 = gen_reg_rtx (V2DImode);
+  rtx r2 = gen_reg_rtx (V2DImode);
+
+  emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1])); /* V2DF -> V2DI */
+  emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2])); /* V2DF -> V2DI */
+  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); /* -> V4SI */
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v2df" ; two V2DF -> one V4SI, signed
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand:V2DF 2 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r1 = gen_reg_rtx (V2DImode);
+  rtx r2 = gen_reg_rtx (V2DImode);
+
+  emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1])); /* V2DF -> V2DI */
+  emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2])); /* V2DF -> V2DI */
+  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); /* -> V4SI */
+  DONE;
+})
+
+; v4sf -> v2di
+(define_expand "vec_unpack_sfix_trunc_lo_v4sf"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_VX"
+{
+  /* Extend to V2DF first; converting via V4SI would clamp to 32 bits.  */
+  rtx r = gen_reg_rtx (V2DFmode);
+  emit_insn (gen_vec_unpacks_lo_v4sf (r, operands[1]));
+  emit_insn (gen_fix_truncv2dfv2di2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpack_sfix_trunc_hi_v4sf"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_VX"
+{
+  /* Extend to V2DF first; converting via V4SI would clamp to 32 bits.  */
+  rtx r = gen_reg_rtx (V2DFmode);
+  emit_insn (gen_vec_unpacks_hi_v4sf (r, operands[1]));
+  emit_insn (gen_fix_truncv2dfv2di2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpack_ufix_trunc_lo_v4sf"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_VX"
+{
+  /* Extend to V2DF first; converting via V4SI would clamp to 32 bits.  */
+  rtx r = gen_reg_rtx (V2DFmode);
+  emit_insn (gen_vec_unpacks_lo_v4sf (r, operands[1]));
+  emit_insn (gen_fixuns_truncv2dfv2di2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpack_ufix_trunc_hi_v4sf"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_VX"
+{
+  /* Extend to V2DF first; converting via V4SI would clamp to 32 bits.  */
+  rtx r = gen_reg_rtx (V2DFmode);
+  emit_insn (gen_vec_unpacks_hi_v4sf (r, operands[1]));
+  emit_insn (gen_fixuns_truncv2dfv2di2 (operands[0], r));
+  DONE;
+})
 
+(define_insn "floatv2siv2sf2" ; last operand 0: use the current rounding mode
+  [(set (match_operand:V2SF 0 "register_operand"  "=v")
+	(float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
+  "TARGET_VXE2"
+  "vcefb\t%v0,%v1,0,0"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "floatunsv2siv2sf2" ; last operand 0: use the current rounding mode
+  [(set (match_operand:V2SF 0 "register_operand"  "=v")
+	(unsigned_float:V2SF (match_operand:V2SI 1 "register_operand" "v")))]
+  "TARGET_VXE2"
+  "vcelfb\t%v0,%v1,0,0"
+  [(set_attr "op_type" "VRR")])
+
+(define_expand "vec_unpacks_float_hi_v4si"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r = gen_reg_rtx (V2DImode);
+  /* Widen the high half to V2DI (signed), then convert it to V2DF.  */
+  emit_insn (gen_vec_unpacks_hi_v4si (r, operands[1]));
+  emit_insn (gen_floatv2div2df2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpacks_float_lo_v4si"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r = gen_reg_rtx (V2DImode);
+  /* Widen the low half to V2DI (signed), then convert it to V2DF.  */
+  emit_insn (gen_vec_unpacks_lo_v4si (r, operands[1]));
+  emit_insn (gen_floatv2div2df2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_hi_v4si"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r = gen_reg_rtx (V2DImode);
+  /* Widen the high half to V2DI (unsigned), then convert it to V2DF.  */
+  emit_insn (gen_vec_unpacku_hi_v4si (r, operands[1]));
+  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_lo_v4si"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r = gen_reg_rtx (V2DImode);
+  /* Widen the low half to V2DI (unsigned), then convert it to V2DF.  */
+  emit_insn (gen_vec_unpacku_lo_v4si (r, operands[1]));
+  emit_insn (gen_floatunsv2div2df2 (operands[0], r));
+  DONE;
+})
 
 ; reduc_smin
 ; reduc_smax
 ; reduc_umin
 ; reduc_umax
-
-; vec_pack_sfix_trunc: convert + pack ?
-; vec_pack_ufix_trunc
-; vec_unpacks_float_hi
-; vec_unpacks_float_lo
-; vec_unpacku_float_hi
-; vec_unpacku_float_lo
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
new file mode 100644
index 000000000000..59a154594e9f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
@@ -0,0 +1,271 @@ 
+/* Check that the single-step vector conversions work.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z15 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-all" } */
+/* { dg-final { scan-tree-dump-not "conversion not supported by target" "slp" } } */
+
+void
+extendv4hiv4si2 (short *in, int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+void
+zero_extendv4hiv4si2 (unsigned short *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+void
+vec_unpacks_v4si (int *in, long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+void
+vec_unpacku_v4si (unsigned int *in, unsigned long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+extedv2siv2di2 (int *in, long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+zero_extedv2siv2di2 (unsigned int *in, unsigned long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+truncv4siv4hi2_signed (int *in, short *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+truncv4siv4hi2_unsigned (unsigned int *in, unsigned short *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+truncv2div2si2_signed (long *in, int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+truncv2div2si2_unsigned (unsigned long *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+fix_truncv4sfv4si2 (float *in, int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+fixuns_truncv4sfv4si2 (float *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_pack_trunc_v2di__signed (long *in, int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_pack_trunc_v2di__unsigned (unsigned long *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_pack_sfix_trunc_v2df (double *in, int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_unpack_sfix_trunc (float *in, long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_unpack_ufix_trunc (float *in, unsigned long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+fix_truncv2dfv2di2 (double *in, long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+fixuns_truncv2dfv2di2 (double *in, unsigned long *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+floatv4hiv4sf2 (short *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+floatunsv4hiv4sf2 (unsigned short *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+floatv4siv4sf2 (int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+floatunsv4siv4sf2 (unsigned int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_packs_float_v2di (int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_packu_float_v2di (unsigned int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+floatv2div2df2 (long *in, double *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+floatunsv2div2df2 (unsigned long *in, double *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+
+void
+floatv2siv2sf2 (int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+
+void
+floatunsv2siv2sf2 (unsigned int *in, float *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+}
+void
+vec_unpacks_float_hi_v4si (int *in, double *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+void
+vec_unpacku_float_hi_v4si (unsigned int *in, double *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
new file mode 100644
index 000000000000..4fcfbd88abe4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c
@@ -0,0 +1,30 @@ 
+/* Check that vec_pack_ufix_trunc_v2df pattern is correctly used.  Even without
+   this pattern, we will vectorize this code, but produce wrong output.  */
+
+/* { dg-do run } */
+/* { dg-options "-O3 -mzarch -march=z13 -ftree-vectorize -fvect-cost-model=unlimited" } */
+
+__attribute__((noinline,noclone,noipa))
+void
+vec_pack_ufix_trunc_v2df (double *in, unsigned int *out);
+
+void
+vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
+{
+  out[0] = in[0];
+  out[1] = in[1];
+  out[2] = in[2];
+  out[3] = in[3];
+}
+
+int main()
+{
+  double in[] = {-1,-2,-3,-4};
+  unsigned int out[4];
+
+  vec_pack_ufix_trunc_v2df (in, out);
+  for (int i = 0; i < 4; ++i)
+    if (out[i] != 0)
+      __builtin_abort();
+  return 0;
+}