Add zero_extendditi2. Improve lxvr*x code generation.

Message ID Yk3aJh5dmEgzo4fs@toto.the-meissners.org
State New
Headers
Series Add zero_extendditi2. Improve lxvr*x code generation. |

Commit Message

Michael Meissner April 6, 2022, 6:21 p.m. UTC
  From bf51c49f1481001c7b3223474d261dcbf9365eda Mon Sep 17 00:00:00 2001
From: Michael Meissner <meissner@linux.ibm.com>
Date: Fri, 1 Apr 2022 22:27:13 -0400
Subject: [PATCH] Add zero_extendditi2.  Improve lxvr*x code generation.

This pattern adds zero_extendditi2 so that if we are extending DImode to
TImode, and we want the result in a vector register, the compiler can
generate MTVSRDDD.

In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow
loading to gpr registers.  This prevents needlessly doing direct moves to
get the value into the vector registers if the gpr register was already
selected.

In updating the insn counts for two tests due to these changes, I noticed
the tests were done at -O0.  I changed this so that the tests are now done
at the normal -O2 optimization level.

I have tested this patch with bootstrap builds and running the regression
testsuite using this patch on:

	Little endian power10, --with-cpu=power10
	Little endian power9, --with-cpu=power9
	Big endian power8, --with-cpu=power8 (both 64/32-bit tests done).

There were no regressions.  Can I check this into the master branch?

2022-04-06   Michael Meissner  <meissner@linux.ibm.com>

gcc/
	* config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to
	GPR registers.
	(vsx_stxvr<wd>x): Add support for storing from GPR registers.
	(zero_extendditi2): New insn.

gcc/testsuite/
	* gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2
	instead of -O0 and update insn counts.
	* gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise.
	* gcc.target/powerpc/zero-extend-di-ti.c: New test.

---
 gcc/config/rs6000/vsx.md                      | 82 +++++++++++++++++--
 .../powerpc/vsx-load-element-extend-int.c     | 36 ++++----
 .../powerpc/vsx-load-element-extend-short.c   | 35 ++++----
 .../gcc.target/powerpc/zero-extend-di-ti.c    | 62 ++++++++++++++
 4 files changed, 164 insertions(+), 51 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
  

Comments

will schmidt April 6, 2022, 8:01 p.m. UTC | #1
On Wed, 2022-04-06 at 14:21 -0400, Michael Meissner wrote:
> From bf51c49f1481001c7b3223474d261dcbf9365eda Mon Sep 17 00:00:00 2001
> From: Michael Meissner <meissner@linux.ibm.com>
> Date: Fri, 1 Apr 2022 22:27:13 -0400
> Subject: [PATCH] Add zero_extendditi2.  Improve lxvr*x code generation.
> 

Hi,

> This pattern adds zero_extendditi2 so that if we are extending DImode to
> TImode, and we want the result in a vector register, the compiler can
> generate MTVSRDDD.
> 
> In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow
> loading to gpr registers.  This prevents needlessly doing direct moves to
> get the value into the vector registers if the gpr register was already
> selected.

ok

> 
> In updating the insn counts for two tests due to these changes, I noticed
> the tests were done at -O0.  I changed this so that the tests are now done
> at the normal -O2 optimization level.

Per the comments (which you fixed up later in patch), I note they were
deliberately done at -O0 since under higher optimizations gcc would
generate other load instructions during those tests.  Presumably with
these changes that is no longer the case.  :-)
> 
> I have tested this patch with bootstrap builds and running the regression
> testsuite using this patch on:
> 
> 	Little endian power10, --with-cpu=power10
> 	Little endian power9, --with-cpu=power9
> 	Big endian power8, --with-cpu=power8 (both 64/32-bit tests done).
> 
> There were no regressions.  Can I check this into the master branch?
> 
> 2022-04-06   Michael Meissner  <meissner@linux.ibm.com>
> 
> gcc/
> 	* config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to
> 	GPR registers.
> 	(vsx_stxvr<wd>x): Add support for storing from GPR registers.
> 	(zero_extendditi2): New insn.
> 
> gcc/testsuite/
> 	* gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2
> 	instead of -O0 and update insn counts.
> 	* gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise.
> 	* gcc.target/powerpc/zero-extend-di-ti.c: New test.
> 
> ---
>  gcc/config/rs6000/vsx.md                      | 82 +++++++++++++++++--
>  .../powerpc/vsx-load-element-extend-int.c     | 36 ++++----
>  .../powerpc/vsx-load-element-extend-short.c   | 35 ++++----
>  .../gcc.target/powerpc/zero-extend-di-ti.c    | 62 ++++++++++++++
>  4 files changed, 164 insertions(+), 51 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index c091e5e2f47..ad971e3a1de 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>"
>      }
>  })
> 
> -;; Load rightmost element from load_data
> -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
> -(define_insn "vsx_lxvr<wd>x"
> -  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
> -	(zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "Z")))]
> -  "TARGET_POWER10"
> -  "lxvr<wd>x %x0,%y1"
> -  [(set_attr "type" "vecload")])
> +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
> +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x
> +;; and then two direct moves if we ultimately need the value in a GPR register.
> +(define_insn_and_split "vsx_lxvr<wd>x"
> +  [(set (match_operand:TI 0 "register_operand" "=r,wa")
> +	(zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "m,Z")))]
> +  "TARGET_POWERPC64 && TARGET_POWER10"
> +  "@
> +   #
> +   lxvr<wd>x %x0,%y1"
> +  "&& reload_completed && int_reg_operand (operands[0], TImode)"
> +  [(set (match_dup 2) (match_dup 3))
> +   (set (match_dup 4) (const_int 0))]
> +{
> +  rtx op0 = operands[0];
> +  rtx op1 = operands[1];
> +
> +  operands[2] = gen_lowpart (DImode, op0);
> +  operands[3] = (<MODE>mode == DImode
> +		 ? op1
> +		 : gen_rtx_ZERO_EXTEND (DImode, op1));
> +
> +  operands[4] = gen_highpart (DImode, op0);
> +}
> +  [(set_attr "type" "load,vecload")
> +   (set_attr "num_insns" "2,*")])
> 
>  ;; Store rightmost element into store_data
>  ;; using stxvrbx, stxvrhx, strvxwx, strvxdx.
> @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di"
>    DONE;
>  })
> 
> +;; Zero extend DI to TI.  If we don't have the MTVSRDD instruction (and LXVRDX
> +;; in the case of power10), we use the machine independent code.  If we are
> +;; loading up GPRs, we fall back to the old code.

In this context it's not clear what is the "old code" ?
The mtvsrdd
instruction is referenced in this code path.  I see no direct reference
to lxvrdx here, though I suppose it's assumed somewhere behind the
emit_ calls.


> +(define_insn_and_split "zero_extendditi2"
> +  [(set (match_operand:TI 0 "register_operand"                 "=r,r, wa,&wa")
> +	(zero_extend:TI (match_operand:DI 1 "register_operand"  "r,wa,r,  wa")))]
> +  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
> +  "@
> +   #
> +   #
> +   mtvsrdd %x0,0,%1
> +   #"
> +  "&& reload_completed
> +   && (int_reg_operand (operands[0], TImode)
> +       || vsx_register_operand (operands[1], DImode))"
> +  [(pc)]
> +{
> +  rtx dest = operands[0];
> +  rtx src = operands[1];
> +  int dest_regno = reg_or_subregno (dest);
> +
> +  /* Handle conversion to GPR registers.  Load up the low part and then do
> +     zero out the upper part.  */
> +  if (INT_REGNO_P (dest_regno))
> +    {
> +      rtx dest_hi = gen_highpart (DImode, dest);
> +      rtx dest_lo = gen_lowpart (DImode, dest);
> +
> +      emit_move_insn (dest_lo, src);
> +      emit_move_insn (dest_hi, const0_rtx);
> +      DONE;
> +    }
> +
> +  /* For settomg a VSX register from another VSX register, clear the result
> +     register, and use XXPERMDI to shift the value into the lower 64-bits.  */

setting

No reference to xxpermdi in the code chunk here, though we are pretty
sure it will be generated via gen_vsx_concat_v2di.  


> +  rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno);
> +  rtx dest_di = gen_rtx_REG (DImode, dest_regno);
> +
> +  emit_move_insn (dest_v2di, CONST0_RTX (V2DImode));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src));
> +  else
> +    emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di));
> +  DONE;
> +}
> +  [(set_attr "type"   "integer,mfvsr,vecmove,vecperm")
> +   (set_attr "length" "8,      8,    *,      8")])
> +
>  ;; Sign extend DI to TI.  We provide both GPR targets and Altivec targets on
>  ;; power10.  On earlier systems, the machine independent code will generate a
>  ;; shift left to sign extend the 64-bit value to 128-bit.


> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> index c40e1a3a0f7..1f1281d6b75 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> @@ -6,33 +6,25 @@
>  /* { dg-do compile { target { ! power10_hw } } } */
>  /* { dg-require-effective-target power10_ok } */
>  /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> -   the lxvr*x instruction is generated. At higher optimization levels
> -   the instruction we are looking for is sometimes replaced by other
> -   load instructions. */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> -
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
>  /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
> 
>  #define NUM_VEC_ELEMS 4
>  #define ITERS 16
> 
> -/*
> -Codegen at time of writing is a single lxvrwx for the zero
> -extended test, and a lwax,mtvsrdd,vextsd2q for the sign
> -extended test.
> -
> -0000000010000c90 <test_sign_extended_load>:
> -    10000c90:	aa 1a 24 7d 	lwax    r9,r4,r3
> -    10000c94:	67 4b 40 7c 	mtvsrdd vs34,0,r9
> -    10000c98:	02 16 5b 10 	vextsd2q v2,v2
> -    10000c9c:	20 00 80 4e 	blr
> -
> -0000000010000cb0 <test_zero_extended_unsigned_load>:
> -    10000cb0:	9b 18 44 7c 	lxvrwx  vs34,r4,r3
> -    10000cb4:	20 00 80 4e 	blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> +   and a lxvrwx + vexts* sign extension instructions for the sign extended
> +   test.
> +
> +   0000000000000000 <test_sign_extended_load>:
> +	0:   9b 18 44 7c     lxvrwx  vs34,r4,r3
> +	4:   02 16 5a 10     vextsw2d v2,v2
> +	8:   02 16 5b 10     vextsd2q v2,v2
> +	c:   20 00 80 4e     blr
> +
> +   0000000000000020 <test_zero_extended_unsigned_load>:
> +	20:   9b 18 44 7c     lxvrwx  vs34,r4,r3
> +	24:   20 00 80 4e     blr  */

ok.

> 
>  #include <altivec.h>
>  #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> index 837ba79c9ab..a7721318812 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> @@ -6,33 +6,26 @@
>  /* { dg-do compile { target { ! power10_hw } } } */
>  /* { dg-require-effective-target power10_ok } */
>  /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> -   the lxvr*x instruction is generated. At higher optimization levels
> -   the instruction we are looking for is sometimes replaced by other
> -   load instructions.  */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
> 
>  /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
> 
>  #define NUM_VEC_ELEMS 8
>  #define ITERS 16
> 
> -/*
> -Codegen at time of writing uses lxvrhx for the zero
> -extension test and lhax,mtvsrdd,vextsd2q for the
> -sign extended test.
> -
> -0000000010001810 <test_sign_extended_load>:
> -    10001810:	ae 1a 24 7d 	lhax    r9,r4,r3
> -    10001814:	67 4b 40 7c 	mtvsrdd vs34,0,r9
> -    10001818:	02 16 5b 10 	vextsd2q v2,v2
> -    1000181c:	20 00 80 4e 	blr
> -
> -0000000010001830 <test_zero_extended_unsigned_load>:
> -    10001830:	5b 18 44 7c 	lxvrhx  vs34,r4,r3
> -    10001834:	20 00 80 4e 	blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> +   and a lxvrwx + vexts* sign extension instructions for the sign extended
> +   test.
> +
> +	0000000000000000 <test_sign_extended_load>:
> +	   0:   5b 18 44 7c     lxvrhx  vs34,r4,r3
> +	   4:   02 16 59 10     vextsh2d v2,v2
> +	   8:   02 16 5b 10     vextsd2q v2,v2
> +	   c:   20 00 80 4e     blr
> +
> +	0000000000000020 <test_zero_extended_unsigned_load>:
> +	  20:   5b 18 44 7c     lxvrhx  vs34,r4,r3
> +	  24:   20 00 80 4e     blr  */
> 

ok

>  #include <altivec.h>
>  #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> new file mode 100644
> index 00000000000..9b3b9c4dbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> @@ -0,0 +1,62 @@
> +/* { dg-require-effective-target int128     } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +/* This patch makes sure the various optimization and code paths are done for
> +   zero extending DImode to TImode on power10.  */
> +
> +__uint128_t
> +gpr_to_gpr (unsigned long long a)
> +{
> +  /* li 4,0.  */
> +  return a;
> +}
> +
> +__uint128_t
> +mem_to_gpr (unsigned long long *p)
> +{
> +  /* ld 3,0(3); li 4,0.  */
> +  return *p;
> +}
> +
> +__uint128_t
> +vsx_to_gpr (__uint128_t *p, double d)
> +{
> +  /* fctiduz 1,1; li 4,0;mfvsrd 3,1.  */
> +  return (unsigned long long)d;
> +}
> +
> +void
> +gpr_to_vsx (__uint128_t *p, unsigned long long a)
> +{
> +  /* mtvsrdd 0,0,4; stxv 0,0(3).  */
> +  __uint128_t b = a;
> +  __asm__ (" # %x0" : "+wa" (b));
> +  *p = b;
> +}
> +
> +void
> +mem_to_vsx (__uint128_t *p, unsigned long long *q)
> +{
> +  /* lxvrdx 0,0,4; stxv 0,0(3).  */
> +  __uint128_t a = *q;
> +  __asm__ (" # %x0" : "+wa" (a));
> +  *p = a;
> +}
> +
> +void
> +vsx_to_vsx (__uint128_t *p, double d)
> +{
> +  /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3).  */
> +  __uint128_t a = (unsigned long long)d;
> +  __asm__ (" # %x0" : "+wa" (a));
> +  *p = a;
> +}

ok

thanks
-Will


> +
> +/* { dg-final { scan-assembler-times {\mli\M}       3 } } */
> +/* { dg-final { scan-assembler-times {\mld\M}       1 } } */
> +/* { dg-final { scan-assembler-times {\mlxvrdx\M}   1 } } */
> +/* { dg-final { scan-assembler-times {\mmfvsrd\M}   1 } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrdd\M}  1 } } */
> +/* { dg-final { scan-assembler-times {\mstxv\M}     3 } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
> -- 
> 2.35.1
> 
>
  
Michael Meissner April 8, 2022, 9:08 p.m. UTC | #2
On Wed, Apr 06, 2022 at 03:01:33PM -0500, will schmidt wrote:
> In this context it's not clear what is the "old code" ?
> The mtvsrdd
> instruction is referenced in this code path.  I see no direct reference
> to lxvrdx here, though I suppose it's assumed somewhere behind the
> emit_ calls.

The lxvrdx comes from this pattern:

(define_insn "vsx_lxvr<wd>x"
  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
       (zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "Z")))]
  "TARGET_POWER10"
  "lxvr<wd>x %x0,%y1"
  [(set_attr "type" "vecload")])

However since we don't currently provide a zero_extendditi2 insn, the compiler
won't generate the normal operation.  Instead, it will use the machine
independent code to copy the bottom bits, and zero the top bits as separate
insns.  By providing the pattern below, it opens up the possibility for
generating the zero_extend insn.

However, if you don't provide support for doing the GPR support, it will force
everything into the vector registers, which mean adding direct moves to get the
values into the appropriate registers.

> 
> > +(define_insn_and_split "zero_extendditi2"
> > +  [(set (match_operand:TI 0 "register_operand"                 "=r,r, wa,&wa")
> > +	(zero_extend:TI (match_operand:DI 1 "register_operand"  "r,wa,r,  wa")))]
> > +  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
> > +  "@
> > +   #
> > +   #
> > +   mtvsrdd %x0,0,%1
> > +   #"
> > +  "&& reload_completed
> > +   && (int_reg_operand (operands[0], TImode)
> > +       || vsx_register_operand (operands[1], DImode))"
> > +  [(pc)]
> > +{
> > +  rtx dest = operands[0];
> > +  rtx src = operands[1];
> > +  int dest_regno = reg_or_subregno (dest);
> > +
> > +  /* Handle conversion to GPR registers.  Load up the low part and then do
> > +     zero out the upper part.  */
> > +  if (INT_REGNO_P (dest_regno))
> > +    {
> > +      rtx dest_hi = gen_highpart (DImode, dest);
> > +      rtx dest_lo = gen_lowpart (DImode, dest);
> > +
> > +      emit_move_insn (dest_lo, src);
> > +      emit_move_insn (dest_hi, const0_rtx);
> > +      DONE;
> > +    }
> > +
> > +  /* For settomg a VSX register from another VSX register, clear the result
> > +     register, and use XXPERMDI to shift the value into the lower 64-bits.  */
> 
> setting
> 
> No reference to xxpermdi in the code chunk here, though we are pretty
> sure it will be generated via gen_vsx_concat_v2di.

Yes.

For example, on power9 for:

	void
	gpr_to_vsx (__uint128_t *p, unsigned long long a)
	{
	  /* mtvsrdd 0,0,4; stxv 0,0(3).  */
	  __uint128_t b = a;
	  __asm__ (" # %x0" : "+wa" (b));
	  *p = b;
	}

	void
	vsx_to_vsx (__uint128_t *p, double d)
	{
	  /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3).  */
	  __uint128_t a = (unsigned long long)d;
	  __asm__ (" # %x0" : "+wa" (a));
	  *p = a;
	}

The code generated is:

	old gpr_to_vsx		new gpr_to_vsx
	==============		==============
	mr 10,4			mtvsrdd 0,0,4
	li 11,0
	mtvsrdd 0,11,10

	old vsx_to_vsx		new new vsx_to_vsx
	==============		==================
	fctiduz 0,1		fctiduz 1,1
	li 11,0			xxsplitib 0,0
	mfvsrd 10,0		xxpermdi 0,0,1,0
	mtvsrdd 0,11,10

In addition, on power10 for:

	void
	mem_to_vsx (__uint128_t *p, unsigned long long *q)
	{
	  /* lxvrdx 0,0,4; stxv 0,0(3).  */
	  __uint128_t a = *q;
	  __asm__ (" # %x0" : "+wa" (a));
	  *p = a;
	}

The code generated is:

	old mem_to_vsx		new mem_to_vsx
	==============		==============
	ld 10,0(4)		lxvrdx 0,0,4
	li 11,0
	mtvsrdd 0,11,10
  
Michael Meissner April 20, 2022, 4:05 p.m. UTC | #3
Ping patch.

| Date: Wed, 6 Apr 2022 14:21:26 -0400
| From: Michael Meissner <meissner@linux.ibm.com>
| Subject: [PATCH] Add zero_extendditi2.  Improve lxvr*x code generation.
  

Patch

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index c091e5e2f47..ad971e3a1de 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1315,14 +1315,32 @@  (define_expand "vsx_store_<mode>"
     }
 })
 
-;; Load rightmost element from load_data
-;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
-(define_insn "vsx_lxvr<wd>x"
-  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
-	(zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "Z")))]
-  "TARGET_POWER10"
-  "lxvr<wd>x %x0,%y1"
-  [(set_attr "type" "vecload")])
+;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
+;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x
+;; and then two direct moves if we ultimately need the value in a GPR register.
+(define_insn_and_split "vsx_lxvr<wd>x"
+  [(set (match_operand:TI 0 "register_operand" "=r,wa")
+	(zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "m,Z")))]
+  "TARGET_POWERPC64 && TARGET_POWER10"
+  "@
+   #
+   lxvr<wd>x %x0,%y1"
+  "&& reload_completed && int_reg_operand (operands[0], TImode)"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (const_int 0))]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  operands[2] = gen_lowpart (DImode, op0);
+  operands[3] = (<MODE>mode == DImode
+		 ? op1
+		 : gen_rtx_ZERO_EXTEND (DImode, op1));
+
+  operands[4] = gen_highpart (DImode, op0);
+}
+  [(set_attr "type" "load,vecload")
+   (set_attr "num_insns" "2,*")])
 
 ;; Store rightmost element into store_data
 ;; using stxvrbx, stxvrhx, strvxwx, strvxdx.
@@ -5019,6 +5037,54 @@  (define_expand "vsignextend_si_v2di"
   DONE;
 })
 
+;; Zero extend DI to TI.  If we don't have the MTVSRDD instruction (and LXVRDX
+;; in the case of power10), we use the machine independent code.  If we are
+;; loading up GPRs, we fall back to the old code.
+(define_insn_and_split "zero_extendditi2"
+  [(set (match_operand:TI 0 "register_operand"                 "=r,r, wa,&wa")
+	(zero_extend:TI (match_operand:DI 1 "register_operand"  "r,wa,r,  wa")))]
+  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
+  "@
+   #
+   #
+   mtvsrdd %x0,0,%1
+   #"
+  "&& reload_completed
+   && (int_reg_operand (operands[0], TImode)
+       || vsx_register_operand (operands[1], DImode))"
+  [(pc)]
+{
+  rtx dest = operands[0];
+  rtx src = operands[1];
+  int dest_regno = reg_or_subregno (dest);
+
+  /* Handle conversion to GPR registers.  Load up the low part and then do
+     zero out the upper part.  */
+  if (INT_REGNO_P (dest_regno))
+    {
+      rtx dest_hi = gen_highpart (DImode, dest);
+      rtx dest_lo = gen_lowpart (DImode, dest);
+
+      emit_move_insn (dest_lo, src);
+      emit_move_insn (dest_hi, const0_rtx);
+      DONE;
+    }
+
+  /* For settomg a VSX register from another VSX register, clear the result
+     register, and use XXPERMDI to shift the value into the lower 64-bits.  */
+  rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno);
+  rtx dest_di = gen_rtx_REG (DImode, dest_regno);
+
+  emit_move_insn (dest_v2di, CONST0_RTX (V2DImode));
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src));
+  else
+    emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di));
+  DONE;
+}
+  [(set_attr "type"   "integer,mfvsr,vecmove,vecperm")
+   (set_attr "length" "8,      8,    *,      8")])
+
 ;; Sign extend DI to TI.  We provide both GPR targets and Altivec targets on
 ;; power10.  On earlier systems, the machine independent code will generate a
 ;; shift left to sign extend the 64-bit value to 128-bit.
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
index c40e1a3a0f7..1f1281d6b75 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
@@ -6,33 +6,25 @@ 
 /* { dg-do compile { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
 /* { dg-require-effective-target int128 } */
-
-/* Deliberately set optization to zero for this test to confirm
-   the lxvr*x instruction is generated. At higher optimization levels
-   the instruction we are looking for is sometimes replaced by other
-   load instructions. */
-/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
-
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
 /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
 
 #define NUM_VEC_ELEMS 4
 #define ITERS 16
 
-/*
-Codegen at time of writing is a single lxvrwx for the zero
-extended test, and a lwax,mtvsrdd,vextsd2q for the sign
-extended test.
-
-0000000010000c90 <test_sign_extended_load>:
-    10000c90:	aa 1a 24 7d 	lwax    r9,r4,r3
-    10000c94:	67 4b 40 7c 	mtvsrdd vs34,0,r9
-    10000c98:	02 16 5b 10 	vextsd2q v2,v2
-    10000c9c:	20 00 80 4e 	blr
-
-0000000010000cb0 <test_zero_extended_unsigned_load>:
-    10000cb0:	9b 18 44 7c 	lxvrwx  vs34,r4,r3
-    10000cb4:	20 00 80 4e 	blr
-*/
+/* Codegen at time of writing is a single lxvrwx for the zero extended test,
+   and a lxvrwx + vexts* sign extension instructions for the sign extended
+   test.
+
+   0000000000000000 <test_sign_extended_load>:
+	0:   9b 18 44 7c     lxvrwx  vs34,r4,r3
+	4:   02 16 5a 10     vextsw2d v2,v2
+	8:   02 16 5b 10     vextsd2q v2,v2
+	c:   20 00 80 4e     blr
+
+   0000000000000020 <test_zero_extended_unsigned_load>:
+	20:   9b 18 44 7c     lxvrwx  vs34,r4,r3
+	24:   20 00 80 4e     blr  */
 
 #include <altivec.h>
 #include <stdio.h>
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
index 837ba79c9ab..a7721318812 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
@@ -6,33 +6,26 @@ 
 /* { dg-do compile { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
 /* { dg-require-effective-target int128 } */
-
-/* Deliberately set optization to zero for this test to confirm
-   the lxvr*x instruction is generated. At higher optimization levels
-   the instruction we are looking for is sometimes replaced by other
-   load instructions.  */
-/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
 
 /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
 
 #define NUM_VEC_ELEMS 8
 #define ITERS 16
 
-/*
-Codegen at time of writing uses lxvrhx for the zero
-extension test and lhax,mtvsrdd,vextsd2q for the
-sign extended test.
-
-0000000010001810 <test_sign_extended_load>:
-    10001810:	ae 1a 24 7d 	lhax    r9,r4,r3
-    10001814:	67 4b 40 7c 	mtvsrdd vs34,0,r9
-    10001818:	02 16 5b 10 	vextsd2q v2,v2
-    1000181c:	20 00 80 4e 	blr
-
-0000000010001830 <test_zero_extended_unsigned_load>:
-    10001830:	5b 18 44 7c 	lxvrhx  vs34,r4,r3
-    10001834:	20 00 80 4e 	blr
-*/
+/* Codegen at time of writing is a single lxvrwx for the zero extended test,
+   and a lxvrwx + vexts* sign extension instructions for the sign extended
+   test.
+
+	0000000000000000 <test_sign_extended_load>:
+	   0:   5b 18 44 7c     lxvrhx  vs34,r4,r3
+	   4:   02 16 59 10     vextsh2d v2,v2
+	   8:   02 16 5b 10     vextsd2q v2,v2
+	   c:   20 00 80 4e     blr
+
+	0000000000000020 <test_zero_extended_unsigned_load>:
+	  20:   5b 18 44 7c     lxvrhx  vs34,r4,r3
+	  24:   20 00 80 4e     blr  */
 
 #include <altivec.h>
 #include <stdio.h>
diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
new file mode 100644
index 00000000000..9b3b9c4dbd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
@@ -0,0 +1,62 @@ 
+/* { dg-require-effective-target int128     } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* This patch makes sure the various optimization and code paths are done for
+   zero extending DImode to TImode on power10.  */
+
+__uint128_t
+gpr_to_gpr (unsigned long long a)
+{
+  /* li 4,0.  */
+  return a;
+}
+
+__uint128_t
+mem_to_gpr (unsigned long long *p)
+{
+  /* ld 3,0(3); li 4,0.  */
+  return *p;
+}
+
+__uint128_t
+vsx_to_gpr (__uint128_t *p, double d)
+{
+  /* fctiduz 1,1; li 4,0;mfvsrd 3,1.  */
+  return (unsigned long long)d;
+}
+
+void
+gpr_to_vsx (__uint128_t *p, unsigned long long a)
+{
+  /* mtvsrdd 0,0,4; stxv 0,0(3).  */
+  __uint128_t b = a;
+  __asm__ (" # %x0" : "+wa" (b));
+  *p = b;
+}
+
+void
+mem_to_vsx (__uint128_t *p, unsigned long long *q)
+{
+  /* lxvrdx 0,0,4; stxv 0,0(3).  */
+  __uint128_t a = *q;
+  __asm__ (" # %x0" : "+wa" (a));
+  *p = a;
+}
+
+void
+vsx_to_vsx (__uint128_t *p, double d)
+{
+  /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3).  */
+  __uint128_t a = (unsigned long long)d;
+  __asm__ (" # %x0" : "+wa" (a));
+  *p = a;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M}       3 } } */
+/* { dg-final { scan-assembler-times {\mld\M}       1 } } */
+/* { dg-final { scan-assembler-times {\mlxvrdx\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrd\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M}  1 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M}     3 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */