rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
Commit Message
If the second operand of __builtin_shuffle is a constant zero vector and the
mask matches one of a few specific patterns, the shuffle can be optimized to
vspltisw+xxpermdi instead of lxv.
gcc/ChangeLog:
* config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
pattern matching and insn emission for VSX xxpermdi.
gcc/testsuite/ChangeLog:
* gcc.target/powerpc/pr102868.c: New test.
---
gcc/config/rs6000/rs6000.c | 47 ++++++++++++++++--
gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
2 files changed, 97 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c
Comments
On Sun, Oct 24, 2021 at 10:51 PM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
> If the second operand of __builtin_shuffle is a constant zero vector and the
> mask matches one of a few specific patterns, the shuffle can be optimized to
> vspltisw+xxpermdi instead of lxv.
>
> gcc/ChangeLog:
>
> * config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
> pattern matching and insn emission for VSX xxpermdi.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/powerpc/pr102868.c: New test.
> ---
> gcc/config/rs6000/rs6000.c | 47 ++++++++++++++++--
> gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
> 2 files changed, 97 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index d0730253bcc..5d802c1fa96 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
> {OPTION_MASK_P8_VECTOR,
> BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
> : CODE_FOR_p8_vmrgew_v4sf_direct,
> - {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
> + {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
> + {OPTION_MASK_VSX,
> + (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> + : CODE_FOR_vsx_xxpermdi_v16qi),
> + {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
> + {OPTION_MASK_VSX,
> + (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> + : CODE_FOR_vsx_xxpermdi_v16qi),
> + {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
> + {OPTION_MASK_VSX,
> + (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> + : CODE_FOR_vsx_xxpermdi_v16qi),
> + {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
> + {OPTION_MASK_VSX,
> + (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> + : CODE_FOR_vsx_xxpermdi_v16qi),
> + {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
If the insn_code is the same for big endian and little endian, why
does the new code test BYTES_BIG_ENDIAN to set the same value
(CODE_FOR_vsx_xxpermdi_v16qi)?
Thanks, David
>
> unsigned int i, j, elt, which;
> unsigned char perm[16];
> @@ -23169,6 +23185,27 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
> machine_mode omode = insn_data[icode].operand[0].mode;
> machine_mode imode = insn_data[icode].operand[1].mode;
>
> + rtx perm_idx = GEN_INT (0);
> + if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
> + {
> + int perm_val = 0;
> + if (one_vec)
> + {
> + if (perm[0] == 8)
> + perm_val |= 2;
> + if (perm[8] == 8)
> + perm_val |= 1;
> + }
> + else
> + {
> + if (perm[0] != 0)
> + perm_val |= 2;
> + if (perm[8] != 16)
> + perm_val |= 1;
> + }
> + perm_idx = GEN_INT (perm_val);
> + }
> +
> /* For little-endian, don't use vpkuwum and vpkuhum if the
> underlying vector type is not V4SI and V8HI, respectively.
> For example, using vpkuwum with a V8HI picks up the even
> @@ -23192,7 +23229,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
> /* For little-endian, the two input operands must be swapped
> (or swapped back) to ensure proper right-to-left numbering
> from 0 to 2N-1. */
> - if (swapped ^ !BYTES_BIG_ENDIAN)
> + if (swapped ^ !BYTES_BIG_ENDIAN
> + && icode != CODE_FOR_vsx_xxpermdi_v16qi)
> std::swap (op0, op1);
> if (imode != V16QImode)
> {
> @@ -23203,7 +23241,10 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
> x = target;
> else
> x = gen_reg_rtx (omode);
> - emit_insn (GEN_FCN (icode) (x, op0, op1));
> + if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
> + emit_insn (GEN_FCN (icode) (x, op0, op1, perm_idx));
> + else
> + emit_insn (GEN_FCN (icode) (x, op0, op1));
> if (omode != V16QImode)
> emit_move_insn (target, gen_lowpart (V16QImode, x));
> return true;
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr102868.c b/gcc/testsuite/gcc.target/powerpc/pr102868.c
> new file mode 100644
> index 00000000000..eb45d193f66
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr102868.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-options "-O2 -mvsx" } */
> +
> +#include <altivec.h>
> +vector float b = {0.0f, 0.0f, 0.0f, 0.0f};
> +
> +
> +vector float foo1 (vector float x)
> +{
> + vector int c = {0, 1, 4, 5};
> + return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo2 (vector float x)
> +{
> + vector int c = {2, 3, 4, 5};
> + return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo3 (vector float x)
> +{
> + vector int c = {0, 1, 6, 7};
> + return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo4 (vector float x)
> +{
> + vector int c = {2, 3, 6, 7};
> + return __builtin_shuffle (x, b, c);
> +}
> +
> +vector unsigned char foo5 (vector unsigned char x)
> +{
> + vector unsigned char c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
> + return __builtin_shuffle (x, c);
> +}
> +
> +vector unsigned char foo6 (vector unsigned char x)
> +{
> + vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
> + return __builtin_shuffle (x, c);
> +}
> +
> +vector unsigned char foo7 (vector unsigned char x)
> +{
> + vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
> + return __builtin_shuffle (x, c);
> +}
> +
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target has_arch_pwr9 } } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target { {! has_arch_pwr9} && be } } } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 11 { target { {! has_arch_pwr9} && le } } } } */
> --
> 2.25.1
>
@@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
{OPTION_MASK_P8_VECTOR,
BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
: CODE_FOR_p8_vmrgew_v4sf_direct,
- {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
+ {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
+ {OPTION_MASK_VSX,
+ (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+ : CODE_FOR_vsx_xxpermdi_v16qi),
+ {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
+ {OPTION_MASK_VSX,
+ (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+ : CODE_FOR_vsx_xxpermdi_v16qi),
+ {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+ {OPTION_MASK_VSX,
+ (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+ : CODE_FOR_vsx_xxpermdi_v16qi),
+ {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
+ {OPTION_MASK_VSX,
+ (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+ : CODE_FOR_vsx_xxpermdi_v16qi),
+ {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
unsigned int i, j, elt, which;
unsigned char perm[16];
@@ -23169,6 +23185,27 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
machine_mode omode = insn_data[icode].operand[0].mode;
machine_mode imode = insn_data[icode].operand[1].mode;
+ rtx perm_idx = GEN_INT (0);
+ if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+ {
+ int perm_val = 0;
+ if (one_vec)
+ {
+ if (perm[0] == 8)
+ perm_val |= 2;
+ if (perm[8] == 8)
+ perm_val |= 1;
+ }
+ else
+ {
+ if (perm[0] != 0)
+ perm_val |= 2;
+ if (perm[8] != 16)
+ perm_val |= 1;
+ }
+ perm_idx = GEN_INT (perm_val);
+ }
+
/* For little-endian, don't use vpkuwum and vpkuhum if the
underlying vector type is not V4SI and V8HI, respectively.
For example, using vpkuwum with a V8HI picks up the even
@@ -23192,7 +23229,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
/* For little-endian, the two input operands must be swapped
(or swapped back) to ensure proper right-to-left numbering
from 0 to 2N-1. */
- if (swapped ^ !BYTES_BIG_ENDIAN)
+ if (swapped ^ !BYTES_BIG_ENDIAN
+ && icode != CODE_FOR_vsx_xxpermdi_v16qi)
std::swap (op0, op1);
if (imode != V16QImode)
{
@@ -23203,7 +23241,10 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
x = target;
else
x = gen_reg_rtx (omode);
- emit_insn (GEN_FCN (icode) (x, op0, op1));
+ if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+ emit_insn (GEN_FCN (icode) (x, op0, op1, perm_idx));
+ else
+ emit_insn (GEN_FCN (icode) (x, op0, op1));
if (omode != V16QImode)
emit_move_insn (target, gen_lowpart (V16QImode, x));
return true;
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include <altivec.h>
+vector float b = {0.0f, 0.0f, 0.0f, 0.0f};
+
+
+vector float foo1 (vector float x)
+{
+ vector int c = {0, 1, 4, 5};
+ return __builtin_shuffle (x, b, c);
+}
+
+vector float foo2 (vector float x)
+{
+ vector int c = {2, 3, 4, 5};
+ return __builtin_shuffle (x, b, c);
+}
+
+vector float foo3 (vector float x)
+{
+ vector int c = {0, 1, 6, 7};
+ return __builtin_shuffle (x, b, c);
+}
+
+vector float foo4 (vector float x)
+{
+ vector int c = {2, 3, 6, 7};
+ return __builtin_shuffle (x, b, c);
+}
+
+vector unsigned char foo5 (vector unsigned char x)
+{
+ vector unsigned char c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+ return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo6 (vector unsigned char x)
+{
+ vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
+ return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo7 (vector unsigned char x)
+{
+ vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+ return __builtin_shuffle (x, c);
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target has_arch_pwr9 } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target { {! has_arch_pwr9} && be } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 11 { target { {! has_arch_pwr9} && le } } } } */