x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_check--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Testing passed
|
Commit Message
PR target/107563
gcc/ChangeLog:
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): New Entry.
gcc/testsuite/ChangeLog:
* g++.target/i386/pr107563.C: New test.
---
gcc/config/i386/i386-expand.cc | 64 ++++++++++++++++++++++++
gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
2 files changed, 87 insertions(+)
create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C
Comments
On Wed, May 8, 2024 at 4:44 AM Levy Hsu <admin@levyhsu.com> wrote:
>
> PR target/107563
>
> gcc/ChangeLog:
>
> * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
> subroutine.
> (ix86_expand_vec_perm_const_1): New Entry.
>
> gcc/testsuite/ChangeLog:
>
> * g++.target/i386/pr107563.C: New test.
> ---
> gcc/config/i386/i386-expand.cc | 64 ++++++++++++++++++++++++
> gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
> 2 files changed, 87 insertions(+)
> create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..2718b0acb87 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
> return true;
> }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1.
> + Implement a permutation with psrlw, psllw and por.
> + It handles case:
> + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
> +
> +static bool
> +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
> +{
> + unsigned i;
> + rtx (*gen_shr) (rtx, rtx, rtx);
> + rtx (*gen_shl) (rtx, rtx, rtx);
> + rtx (*gen_or) (rtx, rtx, rtx);
> + machine_mode mode = VOIDmode;
> +
> + if (!TARGET_SSE2 || !d->one_operand_p)
> + return false;
> +
> + switch (d->vmode)
> + {
> + case E_V8QImode:
> + if (!TARGET_MMX_WITH_SSE)
> + return false;
> + mode = V4HImode;
> + gen_shr = gen_ashrv4hi3;
> + gen_shl = gen_ashlv4hi3;
> + gen_or = gen_iorv4hi3;
> + break;
> + case E_V16QImode:
> + mode = V8HImode;
> + gen_shr = gen_vlshrv8hi3;
> + gen_shl = gen_vashlv8hi3;
> + gen_or = gen_iorv8hi3;
> + break;
> + default: return false;
> + }
> +
> + if (!rtx_equal_p (d->op0, d->op1))
> + return false;
> +
> + for (i = 0; i < d->nelt; i += 2)
> + if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + rtx tmp1 = gen_reg_rtx (mode);
> + rtx tmp2 = gen_reg_rtx (mode);
> + rtx op0 = force_reg (d->vmode, d->op0);
> +
> + emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
> + emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
> + emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
> + emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
> + emit_insn (gen_or (tmp1, tmp1, tmp2));
> + emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
> +
> + return true;
> +}
> +
> /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
> permutation using two vperm2f128, followed by a vshufpd insn blending
> the two vectors together. */
> @@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>
> if (expand_vec_perm_2perm_pblendv (d, false))
> return true;
> +
> + if (expand_vec_perm_psrlw_psllw_por (d))
> + return true;
>
> /* Try sequences of four instructions. */
>
> diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
> new file mode 100755
> index 00000000000..5b0c648e8f1
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563.C
> @@ -0,0 +1,23 @@
> +/* PR target/107563.C */
> +/* { dg-do compile { target { ! ia32 } } } */
Please split the testcase into two files: one (e.g. pr107563-a.C)
testing 8-byte vectors and the other (e.g. pr107563-b.C) testing 16-byte
vectors. The latter can also be tested on 32-bit targets.
Uros.
> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-not "movzbl" } } */
> +/* { dg-final { scan-assembler-not "salq" } } */
> +/* { dg-final { scan-assembler-not "orq" } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq" } } */
> +/* { dg-final { scan-assembler-times "psllw" 2 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 1 } } */
> +/* { dg-final { scan-assembler-times "psraw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 2 } } */
> +
> +using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
> +void foo (temp_vec_type& v) noexcept
> +{
> + v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +}
> +
> +using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
> +void foo2 (temp_vec_type2& v) noexcept
> +{
> + v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
> +}
> --
> 2.31.1
>
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
return true;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Swap the two bytes of
+ each 16-bit word with psrlw, psllw and por: (x >> 8) | (x << 8). The
+ right shift must be logical so the vacated high byte is zero. Handles:
+ __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+ __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+ unsigned i;
+ rtx (*gen_shr) (rtx, rtx, rtx);
+ rtx (*gen_shl) (rtx, rtx, rtx);
+ rtx (*gen_or) (rtx, rtx, rtx);
+ machine_mode mode = VOIDmode;
+
+ if (!TARGET_SSE2 || !d->one_operand_p)
+ return false;
+
+ switch (d->vmode)
+ {
+ case E_V8QImode:
+ if (!TARGET_MMX_WITH_SSE)
+ return false;
+ mode = V4HImode;
+ gen_shr = gen_lshrv4hi3;
+ gen_shl = gen_ashlv4hi3;
+ gen_or = gen_iorv4hi3;
+ break;
+ case E_V16QImode:
+ mode = V8HImode;
+ gen_shr = gen_vlshrv8hi3;
+ gen_shl = gen_vashlv8hi3;
+ gen_or = gen_iorv8hi3;
+ break;
+ default: return false;
+ }
+
+ if (!rtx_equal_p (d->op0, d->op1))
+ return false;
+ /* Require perm = { 1, 0, 3, 2, ... }: each adjacent byte pair swapped. */
+ for (i = 0; i < d->nelt; i += 2)
+ if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+ return false;
+ /* In testing mode report success without emitting any insns. */
+ if (d->testing_p)
+ return true;
+
+ rtx tmp1 = gen_reg_rtx (mode);
+ rtx tmp2 = gen_reg_rtx (mode);
+ rtx op0 = force_reg (d->vmode, d->op0);
+ /* View the bytes as 16-bit words and form (x >> 8) | (x << 8). */
+ emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+ emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+ emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+ emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+ emit_insn (gen_or (tmp1, tmp1, tmp2));
+ emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_2perm_pblendv (d, false))
return true;
+
+ if (expand_vec_perm_psrlw_psllw_por (d))
+ return true;
/* Try sequences of four instructions. */
new file mode 100755
@@ -0,0 +1,23 @@
+/* PR target/107563 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+/* { dg-final { scan-assembler-not "psraw" } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+ v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+ v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}