i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))

Message ID 20220922071944.1472123-1-lin1.hu@intel.com
State Committed
Commit a282f086ef26d90e9785e992cd09a0d118b24695
Headers
Series i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) |

Commit Message

Li, Pan2 via Gcc-patches Sept. 22, 2022, 7:19 a.m. UTC
  Hi all,

This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result.

Regtested on x86_64-pc-linux-gnu. Ok for trunk?

BRs,
Lin

gcc/ChangeLog:

	PR target/94962
	* config/i386/constraints.md (BH): New define_constraint.
	* config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate.
	(standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd".
	* config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate.
	(vector_all_ones_zero_extend_quarter_operand): Ditto.
	* config/i386/sse.md: Add constraint to insn "mov<mode>_internal".

gcc/testsuite/ChangeLog:

	PR target/94962
	* gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
	* gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
	* gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
	* gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
        * gcc.target/i386/pr94962-1.c: New test.
        * gcc.target/i386/pr94962-2.c: Ditto.
        * gcc.target/i386/pr94962-3.c: Ditto.
        * gcc.target/i386/pr94962-4.c: Ditto.
---
 gcc/config/i386/constraints.md                |  8 +++
 gcc/config/i386/i386.cc                       | 26 +++++++-
 gcc/config/i386/predicates.md                 | 49 ++++++++++++++
 gcc/config/i386/sse.md                        |  8 +--
 .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
 .../i386/avx256-unaligned-store-1.c           |  4 +-
 .../i386/avx256-unaligned-store-2.c           |  4 +-
 .../i386/avx256-unaligned-store-3.c           |  4 +-
 gcc/testsuite/gcc.target/i386/pr94962-1.c     | 11 ++++
 gcc/testsuite/gcc.target/i386/pr94962-2.c     | 17 +++++
 gcc/testsuite/gcc.target/i386/pr94962-3.c     | 64 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr94962-4.c     | 49 ++++++++++++++
 12 files changed, 235 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c
  

Comments

Hongtao Liu Sept. 23, 2022, 1:47 a.m. UTC | #1
On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi all,
>
> This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
>         PR target/94962
>         * config/i386/constraints.md (BH): New define_constraint.
>         * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate.
>         (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd".
>         * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate.
>         (vector_all_ones_zero_extend_quarter_operand): Ditto.
>         * config/i386/sse.md: Add constraint to insn "mov<mode>_internal".
(mov<mode>_internal): Add new constraint BH.
Put the insn name at first.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/94962
>         * gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
>         * gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
>         * gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
>         * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
>         * gcc.target/i386/pr94962-1.c: New test.
>         * gcc.target/i386/pr94962-2.c: Ditto.
>         * gcc.target/i386/pr94962-3.c: Ditto.
>         * gcc.target/i386/pr94962-4.c: Ditto.
> ---
>  gcc/config/i386/constraints.md                |  8 +++
>  gcc/config/i386/i386.cc                       | 26 +++++++-
>  gcc/config/i386/predicates.md                 | 49 ++++++++++++++
>  gcc/config/i386/sse.md                        |  8 +--
>  .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
>  .../i386/avx256-unaligned-store-1.c           |  4 +-
>  .../i386/avx256-unaligned-store-2.c           |  4 +-
>  .../i386/avx256-unaligned-store-3.c           |  4 +-
>  gcc/testsuite/gcc.target/i386/pr94962-1.c     | 11 ++++
>  gcc/testsuite/gcc.target/i386/pr94962-2.c     | 17 +++++
>  gcc/testsuite/gcc.target/i386/pr94962-3.c     | 64 +++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr94962-4.c     | 49 ++++++++++++++
>  12 files changed, 235 insertions(+), 13 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c
>
> diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
> index 7361687632f..95b2b142d41 100644
> --- a/gcc/config/i386/constraints.md
> +++ b/gcc/config/i386/constraints.md
> @@ -168,6 +168,9 @@
>  ;;  z  Constant call address operand.
>  ;;  C  Integer SSE constant with all bits set operand.
>  ;;  F  Floating-point SSE constant with all bits set operand.
> +;;  H  Integer SSE constant that is 128/256bit all ones
> +;;     and zero-extand to 256/512bit, or 128bit all ones
> +;;     and zero-extend to 512bit.
>  ;;  M  x86-64 memory operand.
>
>  (define_constraint "Bf"
> @@ -233,6 +236,11 @@
>    (and (match_test "TARGET_SSE")
>         (match_operand 0 "float_vector_all_ones_operand")))
>
> +(define_constraint "BH"
> +  "@internal integer constant with last half/quarter bits set operand."
> +  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
> +       (match_operand 0 "vector_all_ones_zero_extend_quarter_operand")))
> +
>  ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64
>  ;; to prevent LRA from converting the operand to the form '(mem (reg X))'
>  ;; where X is a base register.
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index dadf453d6c0..ca799da5d7e 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx)
>                                        XFmode);
>  }
>
> -/* Return 1 if X is all bits 0 and 2 if X is all bits 1
> +/* Return 1 if X is all bits 0, 2 if X is all bits 1
> +   and 3 if X is all bits 1 with zero extend
>     in supported SSE/AVX vector mode.  */
>
>  int
> @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
>         }
>      }
>
> +  if (vector_all_ones_zero_extend_half_operand (x, mode)
> +      || vector_all_ones_zero_extend_quarter_operand (x, mode))
> +    return 3;
> +
>    return 0;
>  }
>
> @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
>           gcc_unreachable ();
>         }
>     }
> +  else if (vector_all_ones_zero_extend_half_operand (x, mode))
> +    {
> +      if (GET_MODE_SIZE (mode) == 64)
> +       {
> +         gcc_assert (TARGET_AVX512F);
> +         return "vpcmpeqd \t %t0, %t0, %t0";
> +       }
> +      else if (GET_MODE_SIZE (mode) == 32)
> +       {
> +         gcc_assert (TARGET_AVX);
> +         return "vpcmpeqd \t %x0, %x0, %x0";
> +       }
> +      gcc_unreachable ();
> +    }
> +  else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
> +    {
> +      gcc_assert (TARGET_AVX512F);
> +      return "vpcmpeqd \t %x0, %x0, %x0";
> +    }
>
Can we merge 2 vpcmpeqd \t %x0, %x0, %x0"; into 1?
like
else if (vector_all_ones_zero_extend_half_operand (x, mode)
            && GET_MODE_SIZE(mode) == 64))
  return "vpcmpeqd \t %t0, %t0, %t0";
else if ((vector_all_ones_zero_extend_half_operand (x, mode)
            && GET_MODE_SIZE (mode) == 32)
           || vector_all_ones_zero_extend_quarter_operand (x, mode))
    return "vpcmpeqd \t %x0, %x0, %x0";

>    gcc_unreachable ();
>  }
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 4f16bb748b5..655eabf793b 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1159,6 +1159,55 @@
>         (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
>         (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
>
> +/* Return true if operand is an 128/256bit all ones vector
> +   that zero-extends to 256/512bit.  */
> +(define_predicate "vector_all_ones_zero_extend_half_operand"
> +  (match_code "const_vector")
> +{
> +  mode = GET_MODE (op);
> +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> +      || (GET_MODE_SIZE (mode) != 32
> +         && GET_MODE_SIZE (mode) != 64))
> +    return false;
> +
> +  int nelts = CONST_VECTOR_NUNITS (op);
> +  for (int i = 0; i != nelts; i++)
> +    {
> +      rtx elt = CONST_VECTOR_ELT (op, i);
> +      if (i < nelts / 2
> +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +      if (i >= nelts / 2
> +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +    }
> +  return true;
> +})
> +
> +/* Return true if operand is an 128bit all ones vector
> +   that zero extends to 512bit.  */
> +(define_predicate "vector_all_ones_zero_extend_quarter_operand"
> +  (match_code "const_vector")
> +{
> +  mode = GET_MODE (op);
> +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> +      || GET_MODE_SIZE (mode) != 64)
> +    return false;
> +
> +  int nelts = CONST_VECTOR_NUNITS (op);
> +  for (int i = 0; i != nelts; i++)
> +    {
> +      rtx elt = CONST_VECTOR_ELT (op, i);
> +      if (i < nelts / 4
> +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +      if (i >= nelts / 4
> +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +    }
> +  return true;
> +})
> +
>  ; Return true when OP is operand acceptable for vector memory operand.
>  ; Only AVX can have misaligned memory operand.
>  (define_predicate "vector_memory_operand"
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index d535c0af043..f804dbe9b7a 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1318,9 +1318,9 @@
>
>  (define_insn "mov<mode>_internal"
>    [(set (match_operand:VMOVE 0 "nonimmediate_operand"
> -        "=v,v ,v ,m")
> +        "=v,v ,v,v ,m")
>         (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
> -        " C,<sseconstm1>,vm,v"))]
> +        " C,<sseconstm1>,BH,vm,v"))]
>    "TARGET_SSE
>     && (register_operand (operands[0], <MODE>mode)
>         || register_operand (operands[1], <MODE>mode))
> @@ -1338,7 +1338,7 @@
>        gcc_unreachable ();
>      }
>  }
> -  [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
> +  [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov")
>     (set_attr "prefix" "maybe_vex")
>     (set (attr "mode")
>         (cond [(match_test "TARGET_AVX")
> @@ -1349,7 +1349,7 @@
>                (and (match_test "<MODE>mode == V2DFmode")
>                     (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
>                  (const_string "V4SF")
> -              (and (eq_attr "alternative" "3")
> +              (and (eq_attr "alternative" "4")
>                     (match_test "TARGET_SSE_TYPELESS_STORES"))
>                  (const_string "V4SF")
>                (and (eq_attr "alternative" "0")
> diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> index 68378a556fb..7115b0a9dde 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> @@ -14,6 +14,6 @@ avx_test (void)
>      c[i] = a[i] * b[i+3];
>  }
>
> -/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */
> -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */
> +/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */
> +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */
>  /* { dg-final { scan-assembler "vinsertf128" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> index d82aecffda9..4c713959df2 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> @@ -17,6 +17,6 @@ avx_test (void)
>      d[i] = c[i] * 20.0;
>  }
>
> -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */
> -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */
> +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */
>  /* { dg-final { scan-assembler "vextractf128" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> index be12529e8d5..4978c37f526 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> @@ -23,6 +23,6 @@ avx_test (void)
>      }
>  }
>
> -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */
> -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */
> +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */
>  /* { dg-final { scan-assembler "vextract.128" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> index 918028df9ed..f909099bcb1 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> @@ -17,6 +17,6 @@ avx_test (void)
>      d[i] = c[i] * 20.0;
>  }
>
> -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */
> -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */
> +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */
>  /* { dg-final { scan-assembler "vextractf128" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-1.c b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> new file mode 100644
> index 00000000000..e3b01249421
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
> +
> +#include <immintrin.h>
> +
> +__m256i mask()
> +{
> +  return _mm256_zextsi128_si256(_mm_set1_epi8(-1));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> new file mode 100644
> index 00000000000..4e10e927ba1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */
> +
> +#include <immintrin.h>
> +
> +__m512i mask1()
> +{
> +  return _mm512_zextsi128_si512(_mm_set1_epi8(-1));
> +}
> +
> +__m512i mask2()
> +{
> +  return _mm512_zextsi256_si512(_mm256_set1_epi8(-1));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> new file mode 100644
> index 00000000000..8d0b9974435
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> @@ -0,0 +1,64 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */
> +
> +typedef long long __v8di __attribute__ ((__vector_size__ (64)));
> +typedef int __v16si __attribute__ ((__vector_size__ (64)));
> +typedef short __v32hi __attribute__ ((__vector_size__ (64)));
> +typedef char __v64qi __attribute__ ((__vector_size__ (64)));
> +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo1 ()
> +{
> +  return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1,
> +                                          0, 0, 0, 0 };
> +}
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo2 ()
> +{
> +  return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 };
> +}
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo3 ()
> +{
> +  return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 };
> +}
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo4 ()
> +{
> +  return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> new file mode 100644
> index 00000000000..5502c39910b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */
> +
> +typedef long long __v4di __attribute__ ((__vector_size__ (32)));
> +typedef int __v8si __attribute__ ((__vector_size__ (32)));
> +typedef short __v16hi __attribute__ ((__vector_size__ (32)));
> +typedef char __v32qi __attribute__ ((__vector_size__ (32)));
> +typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo1 ()
> +{
> +  return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 };
> +}
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo2 ()
> +{
> +  return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1,
> +                                           0, 0, 0, 0 };
> +}
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo3 ()
> +{
> +  return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 };
> +}
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo4 ()
> +{
> +  return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 };
> +}
> --
> 2.18.2
>

Others LGTM.
  
Li, Pan2 via Gcc-patches Sept. 23, 2022, 3:07 a.m. UTC | #2
Hi, Hongtao

I have modefied this patch and regtested on x86_64-pc-linux-gnu.

BRs.
Lin

-----Original Message-----
From: Hongtao Liu <crazylht@gmail.com> 
Sent: Friday, September 23, 2022 9:48 AM
To: Hu, Lin1 <lin1.hu@intel.com>
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>
Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))

On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> Hi all,
>
> This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
>         PR target/94962
>         * config/i386/constraints.md (BH): New define_constraint.
>         * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate.
>         (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd".
>         * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate.
>         (vector_all_ones_zero_extend_quarter_operand): Ditto.
>         * config/i386/sse.md: Add constraint to insn "mov<mode>_internal".
(mov<mode>_internal): Add new constraint BH.
Put the insn name at first.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/94962
>         * gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
>         * gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
>         * gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
>         * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
>         * gcc.target/i386/pr94962-1.c: New test.
>         * gcc.target/i386/pr94962-2.c: Ditto.
>         * gcc.target/i386/pr94962-3.c: Ditto.
>         * gcc.target/i386/pr94962-4.c: Ditto.
> ---
>  gcc/config/i386/constraints.md                |  8 +++
>  gcc/config/i386/i386.cc                       | 26 +++++++-
>  gcc/config/i386/predicates.md                 | 49 ++++++++++++++
>  gcc/config/i386/sse.md                        |  8 +--
>  .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
>  .../i386/avx256-unaligned-store-1.c           |  4 +-
>  .../i386/avx256-unaligned-store-2.c           |  4 +-
>  .../i386/avx256-unaligned-store-3.c           |  4 +-
>  gcc/testsuite/gcc.target/i386/pr94962-1.c     | 11 ++++
>  gcc/testsuite/gcc.target/i386/pr94962-2.c     | 17 +++++
>  gcc/testsuite/gcc.target/i386/pr94962-3.c     | 64 +++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr94962-4.c     | 49 ++++++++++++++
>  12 files changed, 235 insertions(+), 13 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c
>
> diff --git a/gcc/config/i386/constraints.md 
> b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644
> --- a/gcc/config/i386/constraints.md
> +++ b/gcc/config/i386/constraints.md
> @@ -168,6 +168,9 @@
>  ;;  z  Constant call address operand.
>  ;;  C  Integer SSE constant with all bits set operand.
>  ;;  F  Floating-point SSE constant with all bits set operand.
> +;;  H  Integer SSE constant that is 128/256bit all ones
> +;;     and zero-extand to 256/512bit, or 128bit all ones
> +;;     and zero-extend to 512bit.
>  ;;  M  x86-64 memory operand.
>
>  (define_constraint "Bf"
> @@ -233,6 +236,11 @@
>    (and (match_test "TARGET_SSE")
>         (match_operand 0 "float_vector_all_ones_operand")))
>
> +(define_constraint "BH"
> +  "@internal integer constant with last half/quarter bits set operand."
> +  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
> +       (match_operand 0 
> +"vector_all_ones_zero_extend_quarter_operand")))
> +
>  ;; NB: Similar to 'm', but don't use define_memory_constraint on 
> x86-64  ;; to prevent LRA from converting the operand to the form '(mem (reg X))'
>  ;; where X is a base register.
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
> dadf453d6c0..ca799da5d7e 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx)
>                                        XFmode);  }
>
> -/* Return 1 if X is all bits 0 and 2 if X is all bits 1
> +/* Return 1 if X is all bits 0, 2 if X is all bits 1
> +   and 3 if X is all bits 1 with zero extend
>     in supported SSE/AVX vector mode.  */
>
>  int
> @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
>         }
>      }
>
> +  if (vector_all_ones_zero_extend_half_operand (x, mode)
> +      || vector_all_ones_zero_extend_quarter_operand (x, mode))
> +    return 3;
> +
>    return 0;
>  }
>
> @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
>           gcc_unreachable ();
>         }
>     }
> +  else if (vector_all_ones_zero_extend_half_operand (x, mode))
> +    {
> +      if (GET_MODE_SIZE (mode) == 64)
> +       {
> +         gcc_assert (TARGET_AVX512F);
> +         return "vpcmpeqd \t %t0, %t0, %t0";
> +       }
> +      else if (GET_MODE_SIZE (mode) == 32)
> +       {
> +         gcc_assert (TARGET_AVX);
> +         return "vpcmpeqd \t %x0, %x0, %x0";
> +       }
> +      gcc_unreachable ();
> +    }
> +  else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
> +    {
> +      gcc_assert (TARGET_AVX512F);
> +      return "vpcmpeqd \t %x0, %x0, %x0";
> +    }
>
Can we merge 2 vpcmpeqd \t %x0, %x0, %x0"; into 1?
like
else if (vector_all_ones_zero_extend_half_operand (x, mode)
            && GET_MODE_SIZE(mode) == 64))
  return "vpcmpeqd \t %t0, %t0, %t0";
else if ((vector_all_ones_zero_extend_half_operand (x, mode)
            && GET_MODE_SIZE (mode) == 32)
           || vector_all_ones_zero_extend_quarter_operand (x, mode))
    return "vpcmpeqd \t %x0, %x0, %x0";

>    gcc_unreachable ();
>  }
> diff --git a/gcc/config/i386/predicates.md 
> b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1159,6 +1159,55 @@
>         (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
>         (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
>
> +/* Return true if operand is an 128/256bit all ones vector
> +   that zero-extends to 256/512bit.  */ (define_predicate 
> +"vector_all_ones_zero_extend_half_operand"
> +  (match_code "const_vector")
> +{
> +  mode = GET_MODE (op);
> +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> +      || (GET_MODE_SIZE (mode) != 32
> +         && GET_MODE_SIZE (mode) != 64))
> +    return false;
> +
> +  int nelts = CONST_VECTOR_NUNITS (op);
> +  for (int i = 0; i != nelts; i++)
> +    {
> +      rtx elt = CONST_VECTOR_ELT (op, i);
> +      if (i < nelts / 2
> +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +      if (i >= nelts / 2
> +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +    }
> +  return true;
> +})
> +
> +/* Return true if operand is an 128bit all ones vector
> +   that zero extends to 512bit.  */
> +(define_predicate "vector_all_ones_zero_extend_quarter_operand"
> +  (match_code "const_vector")
> +{
> +  mode = GET_MODE (op);
> +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> +      || GET_MODE_SIZE (mode) != 64)
> +    return false;
> +
> +  int nelts = CONST_VECTOR_NUNITS (op);
> +  for (int i = 0; i != nelts; i++)
> +    {
> +      rtx elt = CONST_VECTOR_ELT (op, i);
> +      if (i < nelts / 4
> +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +      if (i >= nelts / 4
> +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> +       return false;
> +    }
> +  return true;
> +})
> +
>  ; Return true when OP is operand acceptable for vector memory operand.
>  ; Only AVX can have misaligned memory operand.
>  (define_predicate "vector_memory_operand"
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
> d535c0af043..f804dbe9b7a 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1318,9 +1318,9 @@
>
>  (define_insn "mov<mode>_internal"
>    [(set (match_operand:VMOVE 0 "nonimmediate_operand"
> -        "=v,v ,v ,m")
> +        "=v,v ,v,v ,m")
>         (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
> -        " C,<sseconstm1>,vm,v"))]
> +        " C,<sseconstm1>,BH,vm,v"))]
>    "TARGET_SSE
>     && (register_operand (operands[0], <MODE>mode)
>         || register_operand (operands[1], <MODE>mode)) @@ -1338,7 
> +1338,7 @@
>        gcc_unreachable ();
>      }
>  }
> -  [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
> +  [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov")
>     (set_attr "prefix" "maybe_vex")
>     (set (attr "mode")
>         (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@
>                (and (match_test "<MODE>mode == V2DFmode")
>                     (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
>                  (const_string "V4SF")
> -              (and (eq_attr "alternative" "3")
> +              (and (eq_attr "alternative" "4")
>                     (match_test "TARGET_SSE_TYPELESS_STORES"))
>                  (const_string "V4SF")
>                (and (eq_attr "alternative" "0") diff --git 
> a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c 
> b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> index 68378a556fb..7115b0a9dde 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> @@ -14,6 +14,6 @@ avx_test (void)
>      c[i] = a[i] * b[i+3];
>  }
>
> -/* { dg-final { scan-assembler-not 
> "vmovups\[^\n\r]*movv8sf_internal/2" } } */
> -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */
> +/* { dg-final { scan-assembler-not 
> +"vmovups\[^\n\r]*movv8sf_internal/3" } } */
> +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */
>  /* { dg-final { scan-assembler "vinsertf128" } } */ diff --git 
> a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c 
> b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> index d82aecffda9..4c713959df2 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> @@ -17,6 +17,6 @@ avx_test (void)
>      d[i] = c[i] * 20.0;
>  }
>
> -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } 
> */
> -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } 
> +*/
> +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */
>  /* { dg-final { scan-assembler "vextractf128" } } */ diff --git 
> a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c 
> b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> index be12529e8d5..4978c37f526 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> @@ -23,6 +23,6 @@ avx_test (void)
>      }
>  }
>
> -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } 
> */
> -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } 
> +*/
> +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */
>  /* { dg-final { scan-assembler "vextract.128" } } */ diff --git 
> a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c 
> b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> index 918028df9ed..f909099bcb1 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> @@ -17,6 +17,6 @@ avx_test (void)
>      d[i] = c[i] * 20.0;
>  }
>
> -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } 
> */
> -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */
> +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } 
> +*/
> +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */
>  /* { dg-final { scan-assembler "vextractf128" } } */ diff --git 
> a/gcc/testsuite/gcc.target/i386/pr94962-1.c 
> b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> new file mode 100644
> index 00000000000..e3b01249421
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 
> +} } */
> +
> +#include <immintrin.h>
> +
> +__m256i mask()
> +{
> +  return _mm256_zextsi128_si256(_mm_set1_epi8(-1));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c 
> b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> new file mode 100644
> index 00000000000..4e10e927ba1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 
> +} } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 
> +} } */
> +
> +#include <immintrin.h>
> +
> +__m512i mask1()
> +{
> +  return _mm512_zextsi128_si512(_mm_set1_epi8(-1));
> +}
> +
> +__m512i mask2()
> +{
> +  return _mm512_zextsi256_si512(_mm256_set1_epi8(-1));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c 
> b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> new file mode 100644
> index 00000000000..8d0b9974435
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> @@ -0,0 +1,64 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 
> +} } */
> +
> +typedef long long __v8di __attribute__ ((__vector_size__ (64))); 
> +typedef int __v16si __attribute__ ((__vector_size__ (64))); typedef 
> +short __v32hi __attribute__ ((__vector_size__ (64))); typedef char 
> +__v64qi __attribute__ ((__vector_size__ (64))); typedef long long 
> +__m512i __attribute__ ((__vector_size__ (64), __may_alias__));
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo1 ()
> +{
> +  return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1,
> +                                          0, 0, 0, 0 }; }
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo2 ()
> +{
> +  return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 }; }
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo3 ()
> +{
> +  return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 }; }
> +
> +__m512i
> +__attribute__ ((noinline, noclone))
> +foo4 ()
> +{
> +  return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 }; }
> diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c 
> b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> new file mode 100644
> index 00000000000..5502c39910b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 
> +} } */
> +
> +typedef long long __v4di __attribute__ ((__vector_size__ (32))); 
> +typedef int __v8si __attribute__ ((__vector_size__ (32))); typedef 
> +short __v16hi __attribute__ ((__vector_size__ (32))); typedef char 
> +__v32qi __attribute__ ((__vector_size__ (32))); typedef long long 
> +__m256i __attribute__ ((__vector_size__ (32), __may_alias__));
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo1 ()
> +{
> +  return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; }
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo2 ()
> +{
> +  return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1,
> +                                           0, 0, 0, 0 }; }
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo3 ()
> +{
> +  return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 }; }
> +
> +__m256i
> +__attribute__ ((noinline, noclone))
> +foo4 ()
> +{
> +  return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           -1, -1, -1, -1,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0,
> +                                           0, 0, 0, 0 }; }
> --
> 2.18.2
>

Others LGTM.

--
BR,
Hongtao
  
Hongtao Liu Sept. 23, 2022, 3:09 a.m. UTC | #3
On Fri, Sep 23, 2022 at 11:07 AM Hu, Lin1 <lin1.hu@intel.com> wrote:
>
> Hi, Hongtao
>
> I have modefied this patch and regtested on x86_64-pc-linux-gnu.
>
Ok.
> BRs.
> Lin
>
> -----Original Message-----
> From: Hongtao Liu <crazylht@gmail.com>
> Sent: Friday, September 23, 2022 9:48 AM
> To: Hu, Lin1 <lin1.hu@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao <hongtao.liu@intel.com>
> Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
>
> On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi all,
> >
> > This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions required to achieve the final result.
> >
> > Regtested on x86_64-pc-linux-gnu. Ok for trunk?
> >
> > BRs,
> > Lin
> >
> > gcc/ChangeLog:
> >
> >         PR target/94962
> >         * config/i386/constraints.md (BH): New define_constraint.
> >         * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate.
> >         (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd".
> >         * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate.
> >         (vector_all_ones_zero_extend_quarter_operand): Ditto.
> >         * config/i386/sse.md: Add constraint to insn "mov<mode>_internal".
> (mov<mode>_internal): Add new constraint BH.
> Put the insn name at first.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/94962
> >         * gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
> >         * gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
> >         * gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
> >         * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
> >         * gcc.target/i386/pr94962-1.c: New test.
> >         * gcc.target/i386/pr94962-2.c: Ditto.
> >         * gcc.target/i386/pr94962-3.c: Ditto.
> >         * gcc.target/i386/pr94962-4.c: Ditto.
> > ---
> >  gcc/config/i386/constraints.md                |  8 +++
> >  gcc/config/i386/i386.cc                       | 26 +++++++-
> >  gcc/config/i386/predicates.md                 | 49 ++++++++++++++
> >  gcc/config/i386/sse.md                        |  8 +--
> >  .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
> >  .../i386/avx256-unaligned-store-1.c           |  4 +-
> >  .../i386/avx256-unaligned-store-2.c           |  4 +-
> >  .../i386/avx256-unaligned-store-3.c           |  4 +-
> >  gcc/testsuite/gcc.target/i386/pr94962-1.c     | 11 ++++
> >  gcc/testsuite/gcc.target/i386/pr94962-2.c     | 17 +++++
> >  gcc/testsuite/gcc.target/i386/pr94962-3.c     | 64 +++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr94962-4.c     | 49 ++++++++++++++
> >  12 files changed, 235 insertions(+), 13 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c
> >
> > diff --git a/gcc/config/i386/constraints.md
> > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644
> > --- a/gcc/config/i386/constraints.md
> > +++ b/gcc/config/i386/constraints.md
> > @@ -168,6 +168,9 @@
> >  ;;  z  Constant call address operand.
> >  ;;  C  Integer SSE constant with all bits set operand.
> >  ;;  F  Floating-point SSE constant with all bits set operand.
> > +;;  H  Integer SSE constant that is 128/256bit all ones
> > +;;     and zero-extand to 256/512bit, or 128bit all ones
> > +;;     and zero-extend to 512bit.
> >  ;;  M  x86-64 memory operand.
> >
> >  (define_constraint "Bf"
> > @@ -233,6 +236,11 @@
> >    (and (match_test "TARGET_SSE")
> >         (match_operand 0 "float_vector_all_ones_operand")))
> >
> > +(define_constraint "BH"
> > +  "@internal integer constant with last half/quarter bits set operand."
> > +  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
> > +       (match_operand 0
> > +"vector_all_ones_zero_extend_quarter_operand")))
> > +
> >  ;; NB: Similar to 'm', but don't use define_memory_constraint on
> > x86-64  ;; to prevent LRA from converting the operand to the form '(mem (reg X))'
> >  ;; where X is a base register.
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index
> > dadf453d6c0..ca799da5d7e 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx)
> >                                        XFmode);  }
> >
> > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1
> > +/* Return 1 if X is all bits 0, 2 if X is all bits 1
> > +   and 3 if X is all bits 1 with zero extend
> >     in supported SSE/AVX vector mode.  */
> >
> >  int
> > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
> >         }
> >      }
> >
> > +  if (vector_all_ones_zero_extend_half_operand (x, mode)
> > +      || vector_all_ones_zero_extend_quarter_operand (x, mode))
> > +    return 3;
> > +
> >    return 0;
> >  }
> >
> > @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
> >           gcc_unreachable ();
> >         }
> >     }
> > +  else if (vector_all_ones_zero_extend_half_operand (x, mode))
> > +    {
> > +      if (GET_MODE_SIZE (mode) == 64)
> > +       {
> > +         gcc_assert (TARGET_AVX512F);
> > +         return "vpcmpeqd \t %t0, %t0, %t0";
> > +       }
> > +      else if (GET_MODE_SIZE (mode) == 32)
> > +       {
> > +         gcc_assert (TARGET_AVX);
> > +         return "vpcmpeqd \t %x0, %x0, %x0";
> > +       }
> > +      gcc_unreachable ();
> > +    }
> > +  else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
> > +    {
> > +      gcc_assert (TARGET_AVX512F);
> > +      return "vpcmpeqd \t %x0, %x0, %x0";
> > +    }
> >
> Can we merge 2 vpcmpeqd \t %x0, %x0, %x0"; into 1?
> like
> else if (vector_all_ones_zero_extend_half_operand (x, mode)
>             && GET_MODE_SIZE(mode) == 64))
>   return "vpcmpeqd \t %t0, %t0, %t0";
> else if ((vector_all_ones_zero_extend_half_operand (x, mode)
>             && GET_MODE_SIZE (mode) == 32)
>            || vector_all_ones_zero_extend_quarter_operand (x, mode))
>     return "vpcmpeqd \t %x0, %x0, %x0";
>
> >    gcc_unreachable ();
> >  }
> > diff --git a/gcc/config/i386/predicates.md
> > b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1159,6 +1159,55 @@
> >         (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
> >         (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
> >
> > +/* Return true if operand is an 128/256bit all ones vector
> > +   that zero-extends to 256/512bit.  */ (define_predicate
> > +"vector_all_ones_zero_extend_half_operand"
> > +  (match_code "const_vector")
> > +{
> > +  mode = GET_MODE (op);
> > +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> > +      || (GET_MODE_SIZE (mode) != 32
> > +         && GET_MODE_SIZE (mode) != 64))
> > +    return false;
> > +
> > +  int nelts = CONST_VECTOR_NUNITS (op);
> > +  for (int i = 0; i != nelts; i++)
> > +    {
> > +      rtx elt = CONST_VECTOR_ELT (op, i);
> > +      if (i < nelts / 2
> > +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> > +       return false;
> > +      if (i >= nelts / 2
> > +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> > +       return false;
> > +    }
> > +  return true;
> > +})
> > +
> > +/* Return true if operand is an 128bit all ones vector
> > +   that zero extends to 512bit.  */
> > +(define_predicate "vector_all_ones_zero_extend_quarter_operand"
> > +  (match_code "const_vector")
> > +{
> > +  mode = GET_MODE (op);
> > +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
> > +      || GET_MODE_SIZE (mode) != 64)
> > +    return false;
> > +
> > +  int nelts = CONST_VECTOR_NUNITS (op);
> > +  for (int i = 0; i != nelts; i++)
> > +    {
> > +      rtx elt = CONST_VECTOR_ELT (op, i);
> > +      if (i < nelts / 4
> > +         && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
> > +       return false;
> > +      if (i >= nelts / 4
> > +         && elt != CONST0_RTX (GET_MODE_INNER (mode)))
> > +       return false;
> > +    }
> > +  return true;
> > +})
> > +
> >  ; Return true when OP is operand acceptable for vector memory operand.
> >  ; Only AVX can have misaligned memory operand.
> >  (define_predicate "vector_memory_operand"
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index
> > d535c0af043..f804dbe9b7a 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1318,9 +1318,9 @@
> >
> >  (define_insn "mov<mode>_internal"
> >    [(set (match_operand:VMOVE 0 "nonimmediate_operand"
> > -        "=v,v ,v ,m")
> > +        "=v,v ,v,v ,m")
> >         (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
> > -        " C,<sseconstm1>,vm,v"))]
> > +        " C,<sseconstm1>,BH,vm,v"))]
> >    "TARGET_SSE
> >     && (register_operand (operands[0], <MODE>mode)
> >         || register_operand (operands[1], <MODE>mode)) @@ -1338,7
> > +1338,7 @@
> >        gcc_unreachable ();
> >      }
> >  }
> > -  [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
> > +  [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov")
> >     (set_attr "prefix" "maybe_vex")
> >     (set (attr "mode")
> >         (cond [(match_test "TARGET_AVX") @@ -1349,7 +1349,7 @@
> >                (and (match_test "<MODE>mode == V2DFmode")
> >                     (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
> >                  (const_string "V4SF")
> > -              (and (eq_attr "alternative" "3")
> > +              (and (eq_attr "alternative" "4")
> >                     (match_test "TARGET_SSE_TYPELESS_STORES"))
> >                  (const_string "V4SF")
> >                (and (eq_attr "alternative" "0") diff --git
> > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> > index 68378a556fb..7115b0a9dde 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
> > @@ -14,6 +14,6 @@ avx_test (void)
> >      c[i] = a[i] * b[i+3];
> >  }
> >
> > -/* { dg-final { scan-assembler-not
> > "vmovups\[^\n\r]*movv8sf_internal/2" } } */
> > -/* { dg-final { scan-assembler "movv4sf_internal/2" } } */
> > +/* { dg-final { scan-assembler-not
> > +"vmovups\[^\n\r]*movv8sf_internal/3" } } */
> > +/* { dg-final { scan-assembler "movv4sf_internal/3" } } */
> >  /* { dg-final { scan-assembler "vinsertf128" } } */ diff --git
> > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> > index d82aecffda9..4c713959df2 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
> > @@ -17,6 +17,6 @@ avx_test (void)
> >      d[i] = c[i] * 20.0;
> >  }
> >
> > -/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } }
> > */
> > -/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */
> > +/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } }
> > +*/
> > +/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */
> >  /* { dg-final { scan-assembler "vextractf128" } } */ diff --git
> > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> > index be12529e8d5..4978c37f526 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
> > @@ -23,6 +23,6 @@ avx_test (void)
> >      }
> >  }
> >
> > -/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } }
> > */
> > -/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */
> > +/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } }
> > +*/
> > +/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */
> >  /* { dg-final { scan-assembler "vextract.128" } } */ diff --git
> > a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> > b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> > index 918028df9ed..f909099bcb1 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
> > @@ -17,6 +17,6 @@ avx_test (void)
> >      d[i] = c[i] * 20.0;
> >  }
> >
> > -/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } }
> > */
> > -/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */
> > +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } }
> > +*/
> > +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */
> >  /* { dg-final { scan-assembler "vextractf128" } } */ diff --git
> > a/gcc/testsuite/gcc.target/i386/pr94962-1.c
> > b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> > new file mode 100644
> > index 00000000000..e3b01249421
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c
> > @@ -0,0 +1,11 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx" } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1
> > +} } */
> > +
> > +#include <immintrin.h>
> > +
> > +__m256i mask()
> > +{
> > +  return _mm256_zextsi128_si256(_mm_set1_epi8(-1));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c
> > b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> > new file mode 100644
> > index 00000000000..4e10e927ba1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1
> > +} } */
> > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1
> > +} } */
> > +
> > +#include <immintrin.h>
> > +
> > +__m512i mask1()
> > +{
> > +  return _mm512_zextsi128_si512(_mm_set1_epi8(-1));
> > +}
> > +
> > +__m512i mask2()
> > +{
> > +  return _mm512_zextsi256_si512(_mm256_set1_epi8(-1));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c
> > b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> > new file mode 100644
> > index 00000000000..8d0b9974435
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c
> > @@ -0,0 +1,64 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512f" } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4
> > +} } */
> > +
> > +typedef long long __v8di __attribute__ ((__vector_size__ (64)));
> > +typedef int __v16si __attribute__ ((__vector_size__ (64))); typedef
> > +short __v32hi __attribute__ ((__vector_size__ (64))); typedef char
> > +__v64qi __attribute__ ((__vector_size__ (64))); typedef long long
> > +__m512i __attribute__ ((__vector_size__ (64), __may_alias__));
> > +
> > +__m512i
> > +__attribute__ ((noinline, noclone))
> > +foo1 ()
> > +{
> > +  return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1,
> > +                                          0, 0, 0, 0 }; }
> > +
> > +__m512i
> > +__attribute__ ((noinline, noclone))
> > +foo2 ()
> > +{
> > +  return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0 }; }
> > +
> > +__m512i
> > +__attribute__ ((noinline, noclone))
> > +foo3 ()
> > +{
> > +  return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0 }; }
> > +
> > +__m512i
> > +__attribute__ ((noinline, noclone))
> > +foo4 ()
> > +{
> > +  return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0 }; }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c
> > b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> > new file mode 100644
> > index 00000000000..5502c39910b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c
> > @@ -0,0 +1,49 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx" } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4
> > +} } */
> > +
> > +typedef long long __v4di __attribute__ ((__vector_size__ (32)));
> > +typedef int __v8si __attribute__ ((__vector_size__ (32))); typedef
> > +short __v16hi __attribute__ ((__vector_size__ (32))); typedef char
> > +__v32qi __attribute__ ((__vector_size__ (32))); typedef long long
> > +__m256i __attribute__ ((__vector_size__ (32), __may_alias__));
> > +
> > +__m256i
> > +__attribute__ ((noinline, noclone))
> > +foo1 ()
> > +{
> > +  return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 }; }
> > +
> > +__m256i
> > +__attribute__ ((noinline, noclone))
> > +foo2 ()
> > +{
> > +  return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1,
> > +                                           0, 0, 0, 0 }; }
> > +
> > +__m256i
> > +__attribute__ ((noinline, noclone))
> > +foo3 ()
> > +{
> > +  return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0 }; }
> > +
> > +__m256i
> > +__attribute__ ((noinline, noclone))
> > +foo4 ()
> > +{
> > +  return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           -1, -1, -1, -1,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0,
> > +                                           0, 0, 0, 0 }; }
> > --
> > 2.18.2
> >
>
> Others LGTM.
>
> --
> BR,
> Hongtao
  

Patch

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7361687632f..95b2b142d41 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -168,6 +168,9 @@ 
 ;;  z  Constant call address operand.
 ;;  C  Integer SSE constant with all bits set operand.
 ;;  F  Floating-point SSE constant with all bits set operand.
+;;  H  Integer SSE constant that is 128/256bit all ones
+;;     and zero-extand to 256/512bit, or 128bit all ones
+;;     and zero-extend to 512bit.
 ;;  M  x86-64 memory operand.
 
 (define_constraint "Bf"
@@ -233,6 +236,11 @@ 
   (and (match_test "TARGET_SSE")
        (match_operand 0 "float_vector_all_ones_operand")))
 
+(define_constraint "BH"
+  "@internal integer constant with last half/quarter bits set operand."
+  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
+       (match_operand 0 "vector_all_ones_zero_extend_quarter_operand")))
+
 ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64
 ;; to prevent LRA from converting the operand to the form '(mem (reg X))'
 ;; where X is a base register.
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dadf453d6c0..ca799da5d7e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5186,7 +5186,8 @@  standard_80387_constant_rtx (int idx)
 				       XFmode);
 }
 
-/* Return 1 if X is all bits 0 and 2 if X is all bits 1
+/* Return 1 if X is all bits 0, 2 if X is all bits 1
+   and 3 if X is all bits 1 with zero extend
    in supported SSE/AVX vector mode.  */
 
 int
@@ -5234,6 +5235,10 @@  standard_sse_constant_p (rtx x, machine_mode pred_mode)
 	}
     }
 
+  if (vector_all_ones_zero_extend_half_operand (x, mode)
+      || vector_all_ones_zero_extend_quarter_operand (x, mode))
+    return 3;
+
   return 0;
 }
 
@@ -5341,6 +5346,25 @@  standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
 	  gcc_unreachable ();
 	}
    }
+  else if (vector_all_ones_zero_extend_half_operand (x, mode))
+    {
+      if (GET_MODE_SIZE (mode) == 64)
+	{
+	  gcc_assert (TARGET_AVX512F);
+	  return "vpcmpeqd \t %t0, %t0, %t0";
+	}
+      else if (GET_MODE_SIZE (mode) == 32)
+	{
+	  gcc_assert (TARGET_AVX);
+	  return "vpcmpeqd \t %x0, %x0, %x0";
+	}
+      gcc_unreachable ();
+    }
+  else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
+    {
+      gcc_assert (TARGET_AVX512F);
+      return "vpcmpeqd \t %x0, %x0, %x0";
+    }
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 4f16bb748b5..655eabf793b 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1159,6 +1159,55 @@ 
        (match_test "INTEGRAL_MODE_P (GET_MODE (op))")
        (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is an 128/256bit all ones vector
+   that zero-extends to 256/512bit.  */
+(define_predicate "vector_all_ones_zero_extend_half_operand"
+  (match_code "const_vector")
+{
+  mode = GET_MODE (op);
+  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
+      || (GET_MODE_SIZE (mode) != 32
+	  && GET_MODE_SIZE (mode) != 64))
+    return false;
+
+  int nelts = CONST_VECTOR_NUNITS (op);
+  for (int i = 0; i != nelts; i++)
+    {
+      rtx elt = CONST_VECTOR_ELT (op, i);
+      if (i < nelts / 2
+	  && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
+	return false;
+      if (i >= nelts / 2
+	  && elt != CONST0_RTX (GET_MODE_INNER (mode)))
+	return false;
+    }
+  return true;
+})
+
+/* Return true if operand is an 128bit all ones vector
+   that zero extends to 512bit.  */
+(define_predicate "vector_all_ones_zero_extend_quarter_operand"
+  (match_code "const_vector")
+{
+  mode = GET_MODE (op);
+  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
+      || GET_MODE_SIZE (mode) != 64)
+    return false;
+
+  int nelts = CONST_VECTOR_NUNITS (op);
+  for (int i = 0; i != nelts; i++)
+    {
+      rtx elt = CONST_VECTOR_ELT (op, i);
+      if (i < nelts / 4
+	  && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
+	return false;
+      if (i >= nelts / 4
+	  && elt != CONST0_RTX (GET_MODE_INNER (mode)))
+	return false;
+    }
+  return true;
+})
+
 ; Return true when OP is operand acceptable for vector memory operand.
 ; Only AVX can have misaligned memory operand.
 (define_predicate "vector_memory_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d535c0af043..f804dbe9b7a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1318,9 +1318,9 @@ 
 
 (define_insn "mov<mode>_internal"
   [(set (match_operand:VMOVE 0 "nonimmediate_operand"
-	 "=v,v ,v ,m")
+	 "=v,v ,v,v ,m")
 	(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
-	 " C,<sseconstm1>,vm,v"))]
+	 " C,<sseconstm1>,BH,vm,v"))]
   "TARGET_SSE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))
@@ -1338,7 +1338,7 @@ 
       gcc_unreachable ();
     }
 }
-  [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
+  [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
 	(cond [(match_test "TARGET_AVX")
@@ -1349,7 +1349,7 @@ 
 	       (and (match_test "<MODE>mode == V2DFmode")
 		    (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
 		 (const_string "V4SF")
-	       (and (eq_attr "alternative" "3")
+	       (and (eq_attr "alternative" "4")
 		    (match_test "TARGET_SSE_TYPELESS_STORES"))
 		 (const_string "V4SF")
 	       (and (eq_attr "alternative" "0")
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
index 68378a556fb..7115b0a9dde 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
@@ -14,6 +14,6 @@  avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */
-/* { dg-final { scan-assembler "movv4sf_internal/2" } } */
+/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */
+/* { dg-final { scan-assembler "movv4sf_internal/3" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
index d82aecffda9..4c713959df2 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
@@ -17,6 +17,6 @@  avx_test (void)
     d[i] = c[i] * 20.0;
 }
 
-/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */
-/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */
+/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */
 /* { dg-final { scan-assembler "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
index be12529e8d5..4978c37f526 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
@@ -23,6 +23,6 @@  avx_test (void)
     }
 }
 
-/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */
-/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */
+/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */
 /* { dg-final { scan-assembler "vextract.128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
index 918028df9ed..f909099bcb1 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
@@ -17,6 +17,6 @@  avx_test (void)
     d[i] = c[i] * 20.0;
 }
 
-/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */
-/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */
+/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */
 /* { dg-final { scan-assembler "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr94962-1.c b/gcc/testsuite/gcc.target/i386/pr94962-1.c
new file mode 100644
index 00000000000..e3b01249421
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94962-1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+__m256i mask()
+{
+  return _mm256_zextsi128_si256(_mm_set1_epi8(-1));
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94962-2.c b/gcc/testsuite/gcc.target/i386/pr94962-2.c
new file mode 100644
index 00000000000..4e10e927ba1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94962-2.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+__m512i mask1()
+{
+  return _mm512_zextsi128_si512(_mm_set1_epi8(-1));
+}
+
+__m512i mask2()
+{
+  return _mm512_zextsi256_si512(_mm256_set1_epi8(-1));
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94962-3.c b/gcc/testsuite/gcc.target/i386/pr94962-3.c
new file mode 100644
index 00000000000..8d0b9974435
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94962-3.c
@@ -0,0 +1,64 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */
+
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo1 ()
+{
+  return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1,
+					   0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo2 ()
+{
+  return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo3 ()
+{
+  return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo4 ()
+{
+  return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0 };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94962-4.c b/gcc/testsuite/gcc.target/i386/pr94962-4.c
new file mode 100644
index 00000000000..5502c39910b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94962-4.c
@@ -0,0 +1,49 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */
+
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo1 ()
+{
+  return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo2 ()
+{
+  return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1,
+					    0, 0, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo3 ()
+{
+  return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo4 ()
+{
+  return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    -1, -1, -1, -1,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0,
+					    0, 0, 0, 0 };
+}