x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

Message ID 20240508024205.3623179-1-admin@levyhsu.com
State New
Headers
Series x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563] |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed

Commit Message

Levy Hsu May 8, 2024, 2:42 a.m. UTC
  PR target/107563

gcc/ChangeLog:

	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C
  

Comments

Uros Bizjak May 8, 2024, 7:07 a.m. UTC | #1
On Wed, May 8, 2024 at 4:44 AM Levy Hsu <admin@levyhsu.com> wrote:
>
>         PR target/107563
>
> gcc/ChangeLog:
>
>         * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
>         subroutine.
>         (ix86_expand_vec_perm_const_1): New Entry.
>
> gcc/testsuite/ChangeLog:
>
>         * g++.target/i386/pr107563.C: New test.
> ---
>  gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
>  2 files changed, 87 insertions(+)
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..2718b0acb87 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
>    return true;
>  }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Swap the two bytes of
> +   every 16-bit lane of a one-operand V16QI/V8QI permutation with psrlw,
> +   psllw and por, e.g.
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +   Return true if D matches (and, unless D->testing_p, was expanded).  */
> +
> +static bool
> +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
> +{
> +  rtx (*gen_shr) (rtx, rtx, rtx);
> +  rtx (*gen_shl) (rtx, rtx, rtx);
> +  rtx (*gen_or) (rtx, rtx, rtx);
> +  machine_mode mode = VOIDmode;
> +
> +  if (!TARGET_SSE2 || !d->one_operand_p || !rtx_equal_p (d->op0, d->op1))
> +    return false;
> +
> +  /* The right shift must be logical: an arithmetic one would smear the
> +     sign of the high byte over the bits por fills with the low byte.
> +     The count is a scalar immediate, so avoid the vector-count v* forms.  */
> +  switch (d->vmode)
> +    {
> +    case E_V8QImode:
> +      if (!TARGET_MMX_WITH_SSE)
> +       return false;
> +      mode = V4HImode;
> +      gen_shr = gen_lshrv4hi3;
> +      gen_shl = gen_ashlv4hi3;
> +      gen_or = gen_iorv4hi3;
> +      break;
> +    case E_V16QImode:
> +      mode = V8HImode;
> +      gen_shr = gen_lshrv8hi3;
> +      gen_shl = gen_ashlv8hi3;
> +      gen_or = gen_iorv8hi3;
> +      break;
> +    default: return false;
> +    }
> +
> +  /* Every even/odd byte pair must be exchanged.  */
> +  for (unsigned i = 0; i < d->nelt; i += 2)
> +    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  rtx tmp1 = gen_reg_rtx (mode);
> +  rtx tmp2 = gen_reg_rtx (mode);
> +  rtx op0 = force_reg (d->vmode, d->op0);
> +
> +  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
> +  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
> +  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
> +  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
> +  emit_insn (gen_or (tmp1, tmp1, tmp2));
> +  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
> +
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
> @@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>
>    if (expand_vec_perm_2perm_pblendv (d, false))
>      return true;
> +
> +  if (expand_vec_perm_psrlw_psllw_por (d))
> +    return true;
>
>    /* Try sequences of four instructions.  */
>
> diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
> new file mode 100755
> index 00000000000..5b0c648e8f1
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563.C
> @@ -0,0 +1,23 @@
> +/* PR target/107563.C */
> +/* { dg-do compile { target { ! ia32 } } } */

Please split the testcase to two files, one (e.g. pr107563-a.C)
testing 8-byte vectors and the other (e.g. pr107563-b.C) using 16-byte
vectors. The latter can also be tested with 32-bit targets.

Uros.

> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-not "movzbl" } } */
> +/* { dg-final { scan-assembler-not "salq" } } */
> +/* { dg-final { scan-assembler-not "orq" } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq" } } */
> +/* { dg-final { scan-assembler-not "psraw" } } */
> +/* { dg-final { scan-assembler-times "psllw" 2 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 2 } } */
> +/* { dg-final { scan-assembler-times "por" 2 } } */
> +/* A byte swap needs logical shifts only, hence psraw must not appear.  */
> +using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
> +void foo (temp_vec_type& v) noexcept
> +{
> +  v = __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +}
> +
> +using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
> +void foo2 (temp_vec_type2& v) noexcept
> +{
> +  v = __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
> +}
> --
> 2.31.1
>
  

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@  expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Swap the two bytes of
+   every 16-bit lane of a one-operand V16QI/V8QI permutation with psrlw,
+   psllw and por, e.g.
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   Return true if D matches (and, unless D->testing_p, was expanded).  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p || !rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  /* The right shift must be logical: an arithmetic one would smear the
+     sign of the high byte over the bits por fills with the low byte.
+     The count is a scalar immediate, so avoid the vector-count v* forms.  */
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  /* Every even/odd byte pair must be exchanged.  */
+  for (unsigned i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23781,6 +23842,9 @@  ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 00000000000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@ 
+/* PR target/107563: a byte swap needs logical shifts, so no psraw.  */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-not "psraw" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+}