[i386] Implement permutation with pslldq + psrldq + por when pshufb is not available.
Commit Message
pand/pandn may be used to clear upper/lower bits of the operands, in
that case there will be 4-5 instructions for permutation, and it's
still better than scalar codes.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/105354
* config/i386/i386-expand.cc
(expand_vec_perm_pslldq_psrldq_por): New function.
(ix86_expand_vec_perm_const_1): Try
expand_vec_perm_pslldq_psrldq_por for both 3-instruction and
4/5-instruction sequence.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr105354-1.c: New test.
* gcc.target/i386/pr105354-2.c: New test.
---
gcc/config/i386/i386-expand.cc | 109 +++++++++++++++++
gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++
3 files changed, 349 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c
Comments
On Mon, May 9, 2022 at 1:22 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> pand/pandn may be used to clear upper/lower bits of the operands, in
> that case there will be 4-5 instructions for permutation, and it's
> still better than scalar codes.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
>
> gcc/ChangeLog:
>
> PR target/105354
> * config/i386/i386-expand.cc
> (expand_vec_perm_pslldq_psrldq_por): New function.
> (ix86_expand_vec_perm_const_1): Try
> expand_vec_perm_pslldq_psrldq_por for both 3-instruction and
> 4/5-instruction sequence.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr105354-1.c: New test.
> * gcc.target/i386/pr105354-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc | 109 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++
> 3 files changed, 349 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index bc806ffa283..49231e964ba 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
> return true;
> }
>
> +/* Implement permutation with pslldq + psrldq + por when pshufb is not
> + available. */
> +static bool
> +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
> +{
> + unsigned i, nelt = d->nelt;
> + unsigned start1, end1 = -1;
> + machine_mode vmode = d->vmode, imode;
> + int start2 = -1;
> + bool clear_op0, clear_op1;
> + unsigned inner_size;
> + rtx op0, op1, dop1;
> + rtx (*gen_vec_shr) (rtx, rtx, rtx);
> + rtx (*gen_vec_shl) (rtx, rtx, rtx);
> +
> + /* pshufb is available under TARGET_SSSE3. */
> + if (TARGET_SSSE3 || !TARGET_SSE2
> + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
> + || (vmode != E_V16QImode && vmode != E_V8HImode))
> + return false;
> +
> + start1 = d->perm[0];
> + for (i = 1; i < nelt; i++)
> + {
> + if (d->perm[i] != d->perm[i-1] + 1)
> + {
> + if (start2 == -1)
> + {
> + start2 = d->perm[i];
> + end1 = d->perm[i-1];
> + }
> + else
> + return false;
> + }
> + else if (d->perm[i] >= nelt
> + && start2 == -1)
> + {
> + start2 = d->perm[i];
> + end1 = d->perm[i-1];
> + }
> + }
> +
> + clear_op0 = end1 != nelt - 1;
> + clear_op1 = start2 % nelt != 0;
> + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
> + if (!pandn && (clear_op0 || clear_op1))
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
> + gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
> + imode = GET_MODE_INNER (vmode);
> + inner_size = GET_MODE_BITSIZE (imode);
> + op0 = gen_reg_rtx (vmode);
> + op1 = gen_reg_rtx (vmode);
> +
> + if (start1)
> + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
> + else
> + emit_move_insn (op0, d->op0);
> +
> + dop1 = d->op1;
> + if (d->one_operand_p)
> + dop1 = d->op0;
> +
> + int shl_offset = end1 - start1 + 1 - start2 % nelt;
> + if (shl_offset)
> + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
> + else
> + emit_move_insn (op1, dop1);
> +
> + /* Clear lower/upper bits for op0/op1. */
> + if (clear_op0 || clear_op1)
> + {
> + rtx vec[16];
> + rtx const_vec;
> + rtx clear;
> + for (i = 0; i != nelt; i++)
> + {
> + if (i < (end1 - start1 + 1))
> + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
> + else
> + vec[i] = CONST0_RTX (imode);
> + }
> + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
> + const_vec = validize_mem (force_const_mem (vmode, const_vec));
> + clear = force_reg (vmode, const_vec);
> +
> + if (clear_op0)
> + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
> + if (clear_op1)
> + emit_move_insn (op1, gen_rtx_AND (vmode,
> + gen_rtx_NOT (vmode, clear),
> + op1));
> + }
> +
> + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
> + return true;
> +}
> +
> /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
> and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
> operands with two "and" and "pack" or two "shift" and "pack" insns.
> @@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> if (expand_vec_perm_pshufb2 (d))
> return true;
>
> + if (expand_vec_perm_pslldq_psrldq_por (d, false))
> + return true;
> +
> if (expand_vec_perm_interleave3 (d))
> return true;
>
> @@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> if (expand_vec_perm_even_odd (d))
> return true;
>
> + /* Generate four or five instructions. */
> + if (expand_vec_perm_pslldq_psrldq_por (d, true))
> + return true;
> +
> /* Even longer sequences. */
> if (expand_vec_perm_vpshufb4_vpermq2 (d))
> return true;
> diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> new file mode 100644
> index 00000000000..8d91ded7420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> @@ -0,0 +1,130 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */
> +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */
> +
> +typedef short v8hi __attribute__((vector_size (16)));
> +typedef char v16qi __attribute__((vector_size (16)));
> +
> +v16qi
> +__attribute__((noipa))
> +foo (v16qi a, v16qi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 15, 16, 17, 18, 19, 20);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo1 (v16qi a, v16qi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 15, 18, 19, 20, 21, 22);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo2 (v16qi a, v16qi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 16, 17, 18, 19, 20, 21);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo3 (v16qi a, v16qi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 17, 18, 19, 20, 21, 22);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo4 (v8hi a, v8hi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo5 (v8hi a, v8hi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo6 (v8hi a, v8hi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo7 (v8hi a, v8hi b)
> +{
> + return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo8 (v16qi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 15, 16, 17, 18, 19, 20);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo9 (v16qi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 15, 18, 19, 20, 21, 22);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo10 (v16qi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 16, 17, 18, 19, 20, 21);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo11 (v16qi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> + 13, 14, 17, 18, 19, 20, 21, 22);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo12 (v8hi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo13 (v8hi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo14 (v8hi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo15 (v8hi a)
> +{
> + return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> new file mode 100644
> index 00000000000..b78b62e1e7e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> @@ -0,0 +1,110 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include "sse2-check.h"
> +
> +#include "pr105354-1.c"
> +void
> +sse2_test (void)
> +{
> + union128i_b a, b, res_ab, exp_ab;
> + union128i_w c, d, res_cd, exp_cd;
> +
> + for (int i = 0; i != 16;i++)
> + {
> + a.a[i] = i;
> + b.a[i] = i + 16;
> + res_ab.a[i] = 0;
> + exp_ab.a[i] = -1;
> + if (i <= 8)
> + {
> + c.a[i] = i;
> + d.a[i] = i + 8;
> + res_cd.a[i] = 0;
> + exp_cd.a[i] = -1;
> + }
> + }
> +
> + res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x);
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort ();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 };
> + res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 };
> + res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 };
> + res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + res_ab.x = (__m128i)foo8 ((v16qi)a.x);
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 };
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort ();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 };
> + res_ab.x = (__m128i)foo9 ((v16qi)a.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 };
> + res_ab.x = (__m128i)foo10 ((v16qi)a.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 };
> + res_ab.x = (__m128i)foo11 ((v16qi)a.x);
> + if (check_union128i_b (exp_ab, res_ab.a))
> + abort();
> +
> + res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x);
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 };
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort ();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 };
> + res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 };
> + res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> + res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x);
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 };
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort ();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 };
> + res_cd.x = (__m128i)foo12 ((v8hi)c.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 };
> + res_cd.x = (__m128i)foo13 ((v8hi)c.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 };
> + res_cd.x = (__m128i)foo14 ((v8hi)c.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 };
> + res_cd.x = (__m128i)foo15 ((v8hi)c.x);
> + if (check_union128i_w (exp_cd, res_cd.a))
> + abort();
> +
> +}
> +
> --
> 2.18.1
>
On Mon, May 9, 2022 at 7:24 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 1:22 PM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > pand/pandn may be used to clear upper/lower bits of the operands, in
> > that case there will be 4-5 instructions for permutation, and it's
> > still better than scalar codes.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> >
> > gcc/ChangeLog:
> >
> > PR target/105354
> > * config/i386/i386-expand.cc
> > (expand_vec_perm_pslldq_psrldq_por): New function.
> > (ix86_expand_vec_perm_const_1): Try
> > expand_vec_perm_pslldq_psrldq_por for both 3-instruction and
> > 4/5-instruction sequence.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr105354-1.c: New test.
> > * gcc.target/i386/pr105354-2.c: New test.
OK, with a slight adjustment below.
Thanks,
Uros.
> > ---
> > gcc/config/i386/i386-expand.cc | 109 +++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++
> > 3 files changed, 349 insertions(+)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index bc806ffa283..49231e964ba 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
> > return true;
> > }
> >
> > +/* Implement permutation with pslldq + psrldq + por when pshufb is not
> > + available. */
> > +static bool
> > +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
> > +{
> > + unsigned i, nelt = d->nelt;
> > + unsigned start1, end1 = -1;
> > + machine_mode vmode = d->vmode, imode;
> > + int start2 = -1;
> > + bool clear_op0, clear_op1;
> > + unsigned inner_size;
> > + rtx op0, op1, dop1;
> > + rtx (*gen_vec_shr) (rtx, rtx, rtx);
> > + rtx (*gen_vec_shl) (rtx, rtx, rtx);
> > +
> > + /* pshufb is available under TARGET_SSSE3. */
> > + if (TARGET_SSSE3 || !TARGET_SSE2
You don't have to check for TARGET_SSSE3 here. The
expand_vec_perm_pslldq_psrldq_por should be positioned in
ix86_expand_vec_perm_const_1 in a place where more optimal insn
sequence is already generated when TARGET_SSSE3 is available.
> > + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
> > + || (vmode != E_V16QImode && vmode != E_V8HImode))
> > + return false;
> > +
> > + start1 = d->perm[0];
> > + for (i = 1; i < nelt; i++)
> > + {
> > + if (d->perm[i] != d->perm[i-1] + 1)
> > + {
> > + if (start2 == -1)
> > + {
> > + start2 = d->perm[i];
> > + end1 = d->perm[i-1];
> > + }
> > + else
> > + return false;
> > + }
> > + else if (d->perm[i] >= nelt
> > + && start2 == -1)
> > + {
> > + start2 = d->perm[i];
> > + end1 = d->perm[i-1];
> > + }
> > + }
> > +
> > + clear_op0 = end1 != nelt - 1;
> > + clear_op1 = start2 % nelt != 0;
> > + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
> > + if (!pandn && (clear_op0 || clear_op1))
> > + return false;
> > +
> > + if (d->testing_p)
> > + return true;
> > +
> > + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
> > + gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
> > + imode = GET_MODE_INNER (vmode);
> > + inner_size = GET_MODE_BITSIZE (imode);
> > + op0 = gen_reg_rtx (vmode);
> > + op1 = gen_reg_rtx (vmode);
> > +
> > + if (start1)
> > + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
> > + else
> > + emit_move_insn (op0, d->op0);
> > +
> > + dop1 = d->op1;
> > + if (d->one_operand_p)
> > + dop1 = d->op0;
> > +
> > + int shl_offset = end1 - start1 + 1 - start2 % nelt;
> > + if (shl_offset)
> > + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
> > + else
> > + emit_move_insn (op1, dop1);
> > +
> > + /* Clear lower/upper bits for op0/op1. */
> > + if (clear_op0 || clear_op1)
> > + {
> > + rtx vec[16];
> > + rtx const_vec;
> > + rtx clear;
> > + for (i = 0; i != nelt; i++)
> > + {
> > + if (i < (end1 - start1 + 1))
> > + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
> > + else
> > + vec[i] = CONST0_RTX (imode);
> > + }
> > + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
> > + const_vec = validize_mem (force_const_mem (vmode, const_vec));
> > + clear = force_reg (vmode, const_vec);
> > +
> > + if (clear_op0)
> > + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
> > + if (clear_op1)
> > + emit_move_insn (op1, gen_rtx_AND (vmode,
> > + gen_rtx_NOT (vmode, clear),
> > + op1));
> > + }
> > +
> > + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
> > + return true;
> > +}
> > +
> > /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
> > and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
> > operands with two "and" and "pack" or two "shift" and "pack" insns.
> > @@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > if (expand_vec_perm_pshufb2 (d))
> > return true;
> >
> > + if (expand_vec_perm_pslldq_psrldq_por (d, false))
> > + return true;
> > +
> > if (expand_vec_perm_interleave3 (d))
> > return true;
> >
> > @@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > if (expand_vec_perm_even_odd (d))
> > return true;
> >
> > + /* Generate four or five instructions. */
> > + if (expand_vec_perm_pslldq_psrldq_por (d, true))
> > + return true;
> > +
> > /* Even longer sequences. */
> > if (expand_vec_perm_vpshufb4_vpermq2 (d))
> > return true;
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> > new file mode 100644
> > index 00000000000..8d91ded7420
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> > @@ -0,0 +1,130 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> > +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */
> > +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */
> > +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */
> > +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */
> > +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */
> > +
> > +typedef short v8hi __attribute__((vector_size (16)));
> > +typedef char v16qi __attribute__((vector_size (16)));
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo (v16qi a, v16qi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 15, 16, 17, 18, 19, 20);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo1 (v16qi a, v16qi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 15, 18, 19, 20, 21, 22);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo2 (v16qi a, v16qi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 16, 17, 18, 19, 20, 21);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo3 (v16qi a, v16qi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 17, 18, 19, 20, 21, 22);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo4 (v8hi a, v8hi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo5 (v8hi a, v8hi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo6 (v8hi a, v8hi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo7 (v8hi a, v8hi b)
> > +{
> > + return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo8 (v16qi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 15, 16, 17, 18, 19, 20);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo9 (v16qi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 15, 18, 19, 20, 21, 22);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo10 (v16qi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 16, 17, 18, 19, 20, 21);
> > +}
> > +
> > +v16qi
> > +__attribute__((noipa))
> > +foo11 (v16qi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > + 13, 14, 17, 18, 19, 20, 21, 22);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo12 (v8hi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo13 (v8hi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo14 (v8hi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13);
> > +}
> > +
> > +v8hi
> > +__attribute__((noipa))
> > +foo15 (v8hi a)
> > +{
> > + return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> > new file mode 100644
> > index 00000000000..b78b62e1e7e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> > @@ -0,0 +1,110 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> > +/* { dg-require-effective-target sse2 } */
> > +
> > +#include "sse2-check.h"
> > +
> > +#include "pr105354-1.c"
> > +void
> > +sse2_test (void)
> > +{
> > + union128i_b a, b, res_ab, exp_ab;
> > + union128i_w c, d, res_cd, exp_cd;
> > +
> > + for (int i = 0; i != 16;i++)
> > + {
> > + a.a[i] = i;
> > + b.a[i] = i + 16;
> > + res_ab.a[i] = 0;
> > + exp_ab.a[i] = -1;
> > + if (i <= 8)
> > + {
> > + c.a[i] = i;
> > + d.a[i] = i + 8;
> > + res_cd.a[i] = 0;
> > + exp_cd.a[i] = -1;
> > + }
> > + }
> > +
> > + res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x);
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort ();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 };
> > + res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 };
> > + res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 };
> > + res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + res_ab.x = (__m128i)foo8 ((v16qi)a.x);
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 };
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort ();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 };
> > + res_ab.x = (__m128i)foo9 ((v16qi)a.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 };
> > + res_ab.x = (__m128i)foo10 ((v16qi)a.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 };
> > + res_ab.x = (__m128i)foo11 ((v16qi)a.x);
> > + if (check_union128i_b (exp_ab, res_ab.a))
> > + abort();
> > +
> > + res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x);
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 };
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort ();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 };
> > + res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 };
> > + res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > + res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x);
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 };
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort ();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 };
> > + res_cd.x = (__m128i)foo12 ((v8hi)c.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 };
> > + res_cd.x = (__m128i)foo13 ((v8hi)c.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 };
> > + res_cd.x = (__m128i)foo14 ((v8hi)c.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 };
> > + res_cd.x = (__m128i)foo15 ((v8hi)c.x);
> > + if (check_union128i_w (exp_cd, res_cd.a))
> > + abort();
> > +
> > +}
> > +
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
On Mon, May 9, 2022 at 4:19 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 7:24 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, May 9, 2022 at 1:22 PM liuhongt via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > pand/pandn may be used to clear upper/lower bits of the operands, in
> > > that case there will be 4-5 instructions for permutation, and it's
> > > still better than scalar codes.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/105354
> > > * config/i386/i386-expand.cc
> > > (expand_vec_perm_pslldq_psrldq_por): New function.
> > > (ix86_expand_vec_perm_const_1): Try
> > > expand_vec_perm_pslldq_psrldq_por for both 3-instruction and
> > > 4/5-instruction sequence.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * gcc.target/i386/pr105354-1.c: New test.
> > > * gcc.target/i386/pr105354-2.c: New test.
>
> OK, with a slight adjustment below.
>
> Thanks,
> Uros.
>
> > > ---
> > > gcc/config/i386/i386-expand.cc | 109 +++++++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++
> > > 3 files changed, 349 insertions(+)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index bc806ffa283..49231e964ba 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
> > > return true;
> > > }
> > >
> > > +/* Implement permutation with pslldq + psrldq + por when pshufb is not
> > > + available. */
> > > +static bool
> > > +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
> > > +{
> > > + unsigned i, nelt = d->nelt;
> > > + unsigned start1, end1 = -1;
> > > + machine_mode vmode = d->vmode, imode;
> > > + int start2 = -1;
> > > + bool clear_op0, clear_op1;
> > > + unsigned inner_size;
> > > + rtx op0, op1, dop1;
> > > + rtx (*gen_vec_shr) (rtx, rtx, rtx);
> > > + rtx (*gen_vec_shl) (rtx, rtx, rtx);
> > > +
> > > + /* pshufb is available under TARGET_SSSE3. */
> > > + if (TARGET_SSSE3 || !TARGET_SSE2
>
> You don't have to check for TARGET_SSSE3 here. The
> expand_vec_perm_pslldq_psrldq_por should be positioned in
> ix86_expand_vec_perm_const_1 in a place where more optimal insn
> sequence is already generated when TARGET_SSSE3 is available.
Changed and committed, thanks for the review.
>
> > > + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
> > > + || (vmode != E_V16QImode && vmode != E_V8HImode))
> > > + return false;
> > > +
> > > + start1 = d->perm[0];
> > > + for (i = 1; i < nelt; i++)
> > > + {
> > > + if (d->perm[i] != d->perm[i-1] + 1)
> > > + {
> > > + if (start2 == -1)
> > > + {
> > > + start2 = d->perm[i];
> > > + end1 = d->perm[i-1];
> > > + }
> > > + else
> > > + return false;
> > > + }
> > > + else if (d->perm[i] >= nelt
> > > + && start2 == -1)
> > > + {
> > > + start2 = d->perm[i];
> > > + end1 = d->perm[i-1];
> > > + }
> > > + }
> > > +
> > > + clear_op0 = end1 != nelt - 1;
> > > + clear_op1 = start2 % nelt != 0;
> > > + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
> > > + if (!pandn && (clear_op0 || clear_op1))
> > > + return false;
> > > +
> > > + if (d->testing_p)
> > > + return true;
> > > +
> > > + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
> > > + gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
> > > + imode = GET_MODE_INNER (vmode);
> > > + inner_size = GET_MODE_BITSIZE (imode);
> > > + op0 = gen_reg_rtx (vmode);
> > > + op1 = gen_reg_rtx (vmode);
> > > +
> > > + if (start1)
> > > + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
> > > + else
> > > + emit_move_insn (op0, d->op0);
> > > +
> > > + dop1 = d->op1;
> > > + if (d->one_operand_p)
> > > + dop1 = d->op0;
> > > +
> > > + int shl_offset = end1 - start1 + 1 - start2 % nelt;
> > > + if (shl_offset)
> > > + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
> > > + else
> > > + emit_move_insn (op1, dop1);
> > > +
> > > + /* Clear lower/upper bits for op0/op1. */
> > > + if (clear_op0 || clear_op1)
> > > + {
> > > + rtx vec[16];
> > > + rtx const_vec;
> > > + rtx clear;
> > > + for (i = 0; i != nelt; i++)
> > > + {
> > > + if (i < (end1 - start1 + 1))
> > > + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
> > > + else
> > > + vec[i] = CONST0_RTX (imode);
> > > + }
> > > + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
> > > + const_vec = validize_mem (force_const_mem (vmode, const_vec));
> > > + clear = force_reg (vmode, const_vec);
> > > +
> > > + if (clear_op0)
> > > + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
> > > + if (clear_op1)
> > > + emit_move_insn (op1, gen_rtx_AND (vmode,
> > > + gen_rtx_NOT (vmode, clear),
> > > + op1));
> > > + }
> > > +
> > > + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
> > > + return true;
> > > +}
> > > +
> > > /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
> > > and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
> > > operands with two "and" and "pack" or two "shift" and "pack" insns.
> > > @@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > > if (expand_vec_perm_pshufb2 (d))
> > > return true;
> > >
> > > + if (expand_vec_perm_pslldq_psrldq_por (d, false))
> > > + return true;
> > > +
> > > if (expand_vec_perm_interleave3 (d))
> > > return true;
> > >
> > > @@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > > if (expand_vec_perm_even_odd (d))
> > > return true;
> > >
> > > + /* Generate four or five instructions. */
> > > + if (expand_vec_perm_pslldq_psrldq_por (d, true))
> > > + return true;
> > > +
> > > /* Even longer sequences. */
> > > if (expand_vec_perm_vpshufb4_vpermq2 (d))
> > > return true;
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> > > new file mode 100644
> > > index 00000000000..8d91ded7420
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> > > @@ -0,0 +1,130 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> > > +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */
> > > +
> > > +typedef short v8hi __attribute__((vector_size (16)));
> > > +typedef char v16qi __attribute__((vector_size (16)));
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo (v16qi a, v16qi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 15, 16, 17, 18, 19, 20);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo1 (v16qi a, v16qi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 15, 18, 19, 20, 21, 22);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo2 (v16qi a, v16qi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 16, 17, 18, 19, 20, 21);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo3 (v16qi a, v16qi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 17, 18, 19, 20, 21, 22);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo4 (v8hi a, v8hi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo5 (v8hi a, v8hi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo6 (v8hi a, v8hi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo7 (v8hi a, v8hi b)
> > > +{
> > > + return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo8 (v16qi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 15, 16, 17, 18, 19, 20);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo9 (v16qi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 15, 18, 19, 20, 21, 22);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo10 (v16qi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 16, 17, 18, 19, 20, 21);
> > > +}
> > > +
> > > +v16qi
> > > +__attribute__((noipa))
> > > +foo11 (v16qi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> > > + 13, 14, 17, 18, 19, 20, 21, 22);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo12 (v8hi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo13 (v8hi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo14 (v8hi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13);
> > > +}
> > > +
> > > +v8hi
> > > +__attribute__((noipa))
> > > +foo15 (v8hi a)
> > > +{
> > > + return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14);
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> > > new file mode 100644
> > > index 00000000000..b78b62e1e7e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> > > @@ -0,0 +1,110 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> > > +/* { dg-require-effective-target sse2 } */
> > > +
> > > +#include "sse2-check.h"
> > > +
> > > +#include "pr105354-1.c"
> > > +void
> > > +sse2_test (void)
> > > +{
> > > + union128i_b a, b, res_ab, exp_ab;
> > > + union128i_w c, d, res_cd, exp_cd;
> > > +
> > > + for (int i = 0; i != 16;i++)
> > > + {
> > > + a.a[i] = i;
> > > + b.a[i] = i + 16;
> > > + res_ab.a[i] = 0;
> > > + exp_ab.a[i] = -1;
> > > + if (i < 8)
> > > + {
> > > + c.a[i] = i;
> > > + d.a[i] = i + 8;
> > > + res_cd.a[i] = 0;
> > > + exp_cd.a[i] = -1;
> > > + }
> > > + }
> > > +
> > > + res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x);
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort ();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 };
> > > + res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 };
> > > + res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 };
> > > + res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + res_ab.x = (__m128i)foo8 ((v16qi)a.x);
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 };
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort ();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 };
> > > + res_ab.x = (__m128i)foo9 ((v16qi)a.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 };
> > > + res_ab.x = (__m128i)foo10 ((v16qi)a.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 };
> > > + res_ab.x = (__m128i)foo11 ((v16qi)a.x);
> > > + if (check_union128i_b (exp_ab, res_ab.a))
> > > + abort();
> > > +
> > > + res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x);
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 };
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort ();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 };
> > > + res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 };
> > > + res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > + res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x);
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 };
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort ();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 };
> > > + res_cd.x = (__m128i)foo12 ((v8hi)c.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 };
> > > + res_cd.x = (__m128i)foo13 ((v8hi)c.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 };
> > > + res_cd.x = (__m128i)foo14 ((v8hi)c.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > + exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 };
> > > + res_cd.x = (__m128i)foo15 ((v8hi)c.x);
> > > + if (check_union128i_w (exp_cd, res_cd.a))
> > > + abort();
> > > +
> > > +}
> > > +
> > > --
> > > 2.18.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
@@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
return true;
}
+/* Implement permutation with pslldq + psrldq + por when pshufb is not
+ available. */
+static bool
+expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
+{
+ unsigned i, nelt = d->nelt;
+ unsigned start1, end1 = -1;
+ machine_mode vmode = d->vmode, imode;
+ int start2 = -1;
+ bool clear_op0, clear_op1;
+ unsigned inner_size;
+ rtx op0, op1, dop1;
+ rtx (*gen_vec_shr) (rtx, rtx, rtx);
+ rtx (*gen_vec_shl) (rtx, rtx, rtx);
+
+ /* pshufb is available under TARGET_SSSE3. */
+ if (TARGET_SSSE3 || !TARGET_SSE2
+ /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
+ || (vmode != E_V16QImode && vmode != E_V8HImode))
+ return false;
+
+ start1 = d->perm[0];
+ for (i = 1; i < nelt; i++)
+ {
+ if (d->perm[i] != d->perm[i-1] + 1)
+ {
+ if (start2 == -1)
+ {
+ start2 = d->perm[i];
+ end1 = d->perm[i-1];
+ }
+ else
+ return false;
+ }
+ else if (d->perm[i] >= nelt
+ && start2 == -1)
+ {
+ start2 = d->perm[i];
+ end1 = d->perm[i-1];
+ }
+ }
+
+ clear_op0 = end1 != nelt - 1;
+ clear_op1 = start2 % nelt != 0;
+ /* pand/pandn is needed to clear upper/lower bits of op0/op1. */
+ if (!pandn && (clear_op0 || clear_op1))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
+ gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
+ imode = GET_MODE_INNER (vmode);
+ inner_size = GET_MODE_BITSIZE (imode);
+ op0 = gen_reg_rtx (vmode);
+ op1 = gen_reg_rtx (vmode);
+
+ if (start1)
+ emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
+ else
+ emit_move_insn (op0, d->op0);
+
+ dop1 = d->op1;
+ if (d->one_operand_p)
+ dop1 = d->op0;
+
+ int shl_offset = end1 - start1 + 1 - start2 % nelt;
+ if (shl_offset)
+ emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
+ else
+ emit_move_insn (op1, dop1);
+
+ /* Clear upper/lower bits of op0/op1. */
+ if (clear_op0 || clear_op1)
+ {
+ rtx vec[16];
+ rtx const_vec;
+ rtx clear;
+ for (i = 0; i != nelt; i++)
+ {
+ if (i < (end1 - start1 + 1))
+ vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
+ else
+ vec[i] = CONST0_RTX (imode);
+ }
+ const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
+ const_vec = validize_mem (force_const_mem (vmode, const_vec));
+ clear = force_reg (vmode, const_vec);
+
+ if (clear_op0)
+ emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
+ if (clear_op1)
+ emit_move_insn (op1, gen_rtx_AND (vmode,
+ gen_rtx_NOT (vmode, clear),
+ op1));
+ }
+
+ emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
+ return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
operands with two "and" and "pack" or two "shift" and "pack" insns.
@@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_pshufb2 (d))
return true;
+ if (expand_vec_perm_pslldq_psrldq_por (d, false))
+ return true;
+
if (expand_vec_perm_interleave3 (d))
return true;
@@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_even_odd (d))
return true;
+ /* Generate four or five instructions. */
+ if (expand_vec_perm_pslldq_psrldq_por (d, true))
+ return true;
+
/* Even longer sequences. */
if (expand_vec_perm_vpshufb4_vpermq2 (d))
return true;
new file mode 100644
@@ -0,0 +1,130 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */
+/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */
+/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */
+/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */
+/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */
+
+typedef short v8hi __attribute__((vector_size (16)));
+typedef char v16qi __attribute__((vector_size (16)));
+
+v16qi
+__attribute__((noipa))
+foo (v16qi a, v16qi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20);
+}
+
+v16qi
+__attribute__((noipa))
+foo1 (v16qi a, v16qi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 18, 19, 20, 21, 22);
+}
+
+v16qi
+__attribute__((noipa))
+foo2 (v16qi a, v16qi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 16, 17, 18, 19, 20, 21);
+}
+
+v16qi
+__attribute__((noipa))
+foo3 (v16qi a, v16qi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 17, 18, 19, 20, 21, 22);
+}
+
+v8hi
+__attribute__((noipa))
+foo4 (v8hi a, v8hi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12);
+}
+
+v8hi
+__attribute__((noipa))
+foo5 (v8hi a, v8hi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13);
+}
+
+v8hi
+__attribute__((noipa))
+foo6 (v8hi a, v8hi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13);
+}
+
+v8hi
+__attribute__((noipa))
+foo7 (v8hi a, v8hi b)
+{
+ return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14);
+}
+
+v16qi
+__attribute__((noipa))
+foo8 (v16qi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20);
+}
+
+v16qi
+__attribute__((noipa))
+foo9 (v16qi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 18, 19, 20, 21, 22);
+}
+
+v16qi
+__attribute__((noipa))
+foo10 (v16qi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 16, 17, 18, 19, 20, 21);
+}
+
+v16qi
+__attribute__((noipa))
+foo11 (v16qi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 17, 18, 19, 20, 21, 22);
+}
+
+v8hi
+__attribute__((noipa))
+foo12 (v8hi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12);
+}
+
+v8hi
+__attribute__((noipa))
+foo13 (v8hi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13);
+}
+
+v8hi
+__attribute__((noipa))
+foo14 (v8hi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13);
+}
+
+v8hi
+__attribute__((noipa))
+foo15 (v8hi a)
+{
+ return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14);
+}
new file mode 100644
@@ -0,0 +1,110 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+
+#include "pr105354-1.c"
+void
+sse2_test (void)
+{
+ union128i_b a, b, res_ab, exp_ab;
+ union128i_w c, d, res_cd, exp_cd;
+
+ for (int i = 0; i != 16;i++)
+ {
+ a.a[i] = i;
+ b.a[i] = i + 16;
+ res_ab.a[i] = 0;
+ exp_ab.a[i] = -1;
+ if (i < 8)
+ {
+ c.a[i] = i;
+ d.a[i] = i + 8;
+ res_cd.a[i] = 0;
+ exp_cd.a[i] = -1;
+ }
+ }
+
+ res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x);
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort ();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 };
+ res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 };
+ res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 };
+ res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ res_ab.x = (__m128i)foo8 ((v16qi)a.x);
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 };
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort ();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 };
+ res_ab.x = (__m128i)foo9 ((v16qi)a.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 };
+ res_ab.x = (__m128i)foo10 ((v16qi)a.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 };
+ res_ab.x = (__m128i)foo11 ((v16qi)a.x);
+ if (check_union128i_b (exp_ab, res_ab.a))
+ abort();
+
+ res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x);
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 };
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort ();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 };
+ res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 };
+ res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+ res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x);
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 };
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort ();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 };
+ res_cd.x = (__m128i)foo12 ((v8hi)c.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 };
+ res_cd.x = (__m128i)foo13 ((v8hi)c.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 };
+ res_cd.x = (__m128i)foo14 ((v8hi)c.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+ exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 };
+ res_cd.x = (__m128i)foo15 ((v8hi)c.x);
+ if (check_union128i_w (exp_cd, res_cd.a))
+ abort();
+
+}
+