[V2] rs6000: load high and low part of 128bit vector independently [PR110040]

Message ID 18dfc483-0815-4ede-aa20-0833a83d9671@linux.ibm.com
State New
Headers
Series [V2] rs6000: load high and low part of 128bit vector independently [PR110040] |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed

Commit Message

jeevitha June 19, 2024, 12:39 p.m. UTC
  Hi All,

Updated the patch based on review comments. This patch passed bootstrap
and regression testing on powerpc64le-linux with no regressions.

PR110040 exposes an issue concerning moves from vector registers to GPRs.
There are two moves, one for upper 64 bits and the other for the lower
64 bits.  In the problematic test case, we are only interested in storing
the lower 64 bits.  However, the instruction for copying the upper 64 bits
is still emitted and is dead code.  This patch adds a splitter that splits
apart the two move instructions so that DCE can remove the dead code after
splitting.

2024-06-19  Jeevitha Palanisamy  <jeevitha@linux.ibm.com>

gcc/
	PR target/110040
	* config/rs6000/vsx.md (split pattern for V1TI to DI move): Defined.

gcc/testsuite/
	PR target/110040
	* gcc.target/powerpc/pr110040-1.c: New testcase.
	* gcc.target/powerpc/pr110040-2.c: New testcase.
  

Comments

Kewen.Lin July 2, 2024, 8:08 a.m. UTC | #1
Hi Jeevitha,

on 2024/6/19 20:39, jeevitha wrote:
> Hi All,
> 
> Updated the patch based on review comments. This patch passed bootstrap
> and regression testing on powerpc64le-linux with no regressions.
> 
> PR110040 exposes an issue concerning moves from vector registers to GPRs.
> There are two moves, one for upper 64 bits and the other for the lower
> 64 bits.  In the problematic test case, we are only interested in storing
> the lower 64 bits.  However, the instruction for copying the upper 64 bits
> is still emitted and is dead code.  This patch adds a splitter that splits
> apart the two move instructions so that DCE can remove the dead code after
> splitting.
> 
> 2024-06-19  Jeevitha Palanisamy  <jeevitha@linux.ibm.com>
> 
> gcc/
> 	PR target/110040
> 	* config/rs6000/vsx.md (split pattern for V1TI to DI move): Defined.

Nit: s/Defined/New define/

> 
> gcc/testsuite/
> 	PR target/110040
> 	* gcc.target/powerpc/pr110040-1.c: New testcase.
> 	* gcc.target/powerpc/pr110040-2.c: New testcase.
> 
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index f135fa079bd..f1979815df6 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -6706,3 +6706,20 @@
>    "vmsumcud %0,%1,%2,%3"
>    [(set_attr "type" "veccomplex")]
>  )
> +
> +(define_split
> +  [(set (match_operand:V1TI 0 "gpc_reg_operand")
> +       (match_operand:V1TI 1 "vsx_register_operand"))]
> +  "reload_completed
> +   && TARGET_DIRECT_MOVE_64BIT
> +   && int_reg_operand (operands[0], V1TImode)
> +   && vsx_register_operand (operands[1], V1TImode)"
> +   [(pc)]
> +{
> +  rtx src_op = gen_rtx_REG (V2DImode, REGNO (operands[1]));
> +  rtx dest_op0 = gen_rtx_REG (DImode, REGNO (operands[0]));
> +  rtx dest_op1 = gen_rtx_REG (DImode, REGNO (operands[0]) + 1);
> +  emit_insn (gen_vsx_extract_v2di (dest_op0, src_op, const0_rtx));
> +  emit_insn (gen_vsx_extract_v2di (dest_op1, src_op, const1_rtx));
> +  DONE;
> +})
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-1.c b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
> new file mode 100644
> index 00000000000..0a521e9e51d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
> @@ -0,0 +1,15 @@
> +/* PR target/110040 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-require-effective-target powerpc_vsx } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> +/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
> +
> +#include <altivec.h>
> +
> +void
> +foo (signed long *dst, vector signed __int128 src)
> +{
> +  *dst = (signed long) src[0];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-2.c b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
> new file mode 100644
> index 00000000000..d2ef471d666
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
> @@ -0,0 +1,16 @@
> +/* PR target/110040 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-require-effective-target powerpc_vsx } */
> +/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
> +
> +/* Note: __builtin_altivec_tr_stxvrwx requires the -mcpu=power10 option */

It's a bit obscure here that the used bif name is "vec_xst_trunc" but this comment
says "__builtin_altivec_tr_stxvrwx", I can understand it's copied from the actual
error message but it also mentions "vec_xst_trunc" there when erroring.

Maybe just say "/* builtin vec_xst_trunc requires power10.  */" to avoid any
confusion here.

OK for trunk with these nits tweaked, thanks!

BR,
Kewen


> +
> +#include <altivec.h>
> +
> +void
> +foo (signed int *dst, vector signed __int128 src)
> +{
> +  __builtin_vec_xst_trunc (src, 0, dst);
> +}
> 
>
  

Patch

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f135fa079bd..f1979815df6 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -6706,3 +6706,20 @@ 
   "vmsumcud %0,%1,%2,%3"
   [(set_attr "type" "veccomplex")]
 )
+
+(define_split
+  [(set (match_operand:V1TI 0 "gpc_reg_operand")
+       (match_operand:V1TI 1 "vsx_register_operand"))]
+  "reload_completed
+   && TARGET_DIRECT_MOVE_64BIT
+   && int_reg_operand (operands[0], V1TImode)
+   && vsx_register_operand (operands[1], V1TImode)"
+   [(pc)]
+{
+  rtx src_op = gen_rtx_REG (V2DImode, REGNO (operands[1]));
+  rtx dest_op0 = gen_rtx_REG (DImode, REGNO (operands[0]));
+  rtx dest_op1 = gen_rtx_REG (DImode, REGNO (operands[0]) + 1);
+  emit_insn (gen_vsx_extract_v2di (dest_op0, src_op, const0_rtx));
+  emit_insn (gen_vsx_extract_v2di (dest_op1, src_op, const1_rtx));
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-1.c b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
new file mode 100644
index 00000000000..0a521e9e51d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
@@ -0,0 +1,15 @@ 
+/* PR target/110040 */
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+
+#include <altivec.h>
+
+void
+foo (signed long *dst, vector signed __int128 src)
+{
+  *dst = (signed long) src[0];
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-2.c b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
new file mode 100644
index 00000000000..d2ef471d666
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
@@ -0,0 +1,16 @@ 
+/* PR target/110040 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+
+/* Note: __builtin_altivec_tr_stxvrwx requires the -mcpu=power10 option */
+
+#include <altivec.h>
+
+void
+foo (signed int *dst, vector signed __int128 src)
+{
+  __builtin_vec_xst_trunc (src, 0, dst);
+}