[i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}.

Message ID 20211221062659.102153-1-hongtao.liu@intel.com
State New
Headers
Series [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}. |

Commit Message

Liu, Hongtao Dec. 21, 2021, 6:26 a.m. UTC
  The purpose of those define_insn_and_split:
1. Combine vpcmpuw and zero_extend into vpcmpuw.
2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov
3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.

It should partially fix the issue in PR.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

	PR target/103750
	* config/i386/sse.md
	(*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.
	(*<avx512>_cmp<mode>3): Ditto.
	(*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
	(*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.
	(*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	Ditto.
	(*<avx512>_ucmp<mode>3): Ditto.
	(*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
	(*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
	New define_insn_and_split.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/bitwise_mask_op-3.c: Adjust test/
	* g++.target/i386/pr103750-1.C: New test.
---
 gcc/config/i386/sse.md                        | 267 ++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr103750-1.C    |  50 ++++
 .../gcc.target/i386/bitwise_mask_op-3.c       |   6 +-
 3 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C
  

Comments

Hongtao Liu Dec. 23, 2021, 5:41 a.m. UTC | #1
On Tue, Dec 21, 2021 at 2:27 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> The purpose of those define_insn_and_split:
> 1. Combine vpcmpuw and zero_extend into vpcmpuw.
> 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov
> 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.
Use DImode as dest of zero_extend is too aggressive which causes
several regression.
New patch add define_insn_and_split just combine  vpcmpuw and
zero_extend into vpcmpuw.
Here's the patch i'm checking in.
>
> It should partially fix the issue in PR.
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.
>
> gcc/ChangeLog:
>
>         PR target/103750
>         * config/i386/sse.md
>         (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>         (*<avx512>_cmp<mode>3): Ditto.
>         (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
>         (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>         (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         Ditto.
>         (*<avx512>_ucmp<mode>3): Ditto.
>         (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
>         (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
>         New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/
>         * g++.target/i386/pr103750-1.C: New test.
> ---
>  gcc/config/i386/sse.md                        | 267 ++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr103750-1.C    |  50 ++++
>  .../gcc.target/i386/bitwise_mask_op-3.c       |   6 +-
>  3 files changed, 320 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 5196149ee32..fb885d58272 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom
> +;; duplicated vpcmpuw to vpcmpuw and kmov
> +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
> +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<V48H_AVX512VL:avx512fmaskmode>
> +           [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +            (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
> +       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<V48H_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +          (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> +            (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
> +  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn_and_split "*<avx512>_cmp<mode>3"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
>         (not:<avx512fmaskmode>
> @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> +       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +          UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_int_iterator UNSPEC_PCMP_ITER
>    [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
>
> @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "const_0_to_7_operand" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> +      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI12_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_UNSIGNED_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
>         (unspec:<avx512fmaskmode>
> @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> +  [(set (match_operand:SWI248x 0 "register_operand" "=k")
> +       (zero_extend:SWI248x
> +         (unspec:<VI48_AVX512VL:avx512fmaskmode>
> +           [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "const_0_to_7_operand" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +  && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
> +      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  "&& <SWI248x:MODE>mode != E_DImode"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<VI48_AVX512VL:avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +          UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> +       (unspec:<avx512fmaskmode>
> +         [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +          (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +          (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +         UNSPEC_UNSIGNED_PCMP))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_dup 1)
> +            (match_dup 2)
> +            (match_dup 3)]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +       (zero_extend:DI
> +         (unspec:<avx512fmaskmode>
> +           [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> +            (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> +            (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> +           UNSPEC_UNSIGNED_PCMP)))]
> +  "TARGET_AVX512BW
> +   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> +  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  [(set_attr "type" "ssecmp")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_insn_and_split "*<avx512>_ucmp<mode>3"
>    [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
>         (not:<avx512fmaskmode>
> diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C
> new file mode 100644
> index 00000000000..83f471331b3
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C
> @@ -0,0 +1,50 @@
> +/* PR target/103750 */
> +/* { dg-do compile }  */
> +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
> +/* { dg-final { scan-assembler-times "kmovw" 2 } } */
> +/* { dg-final { scan-assembler-times "kmovd" 2 } } */
> +/* There shouldn't be any kmovw/kmovd inside the loop.  */
> +#include <immintrin.h>
> +
> +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> +  __m256i mch256 = _mm256_set1_epi16(c);
> +  for ( ; n < e; n += 32) {
> +    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> +    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> +    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> +    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> +    if (_kortestz_mask16_u8(mask1, mask2))
> +      continue;
> +
> +    unsigned idx = _tzcnt_u32(mask1);
> +    if (mask1 == 0) {
> +      idx = __tzcnt_u16(mask2);
> +      n += 16;
> +    }
> +    return n + idx;
> +  }
> +  return e;
> +}
> +
> +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> +  __m256i mch256 = _mm256_set1_epi16(c);
> +  for ( ; n < e; n += 32) {
> +    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> +    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> +    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> +    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> +    if (_kortestz_mask32_u8(mask1, mask2))
> +      continue;
> +
> +    unsigned idx = _tzcnt_u32(mask1);
> +    if (mask1 == 0) {
> +      idx = __tzcnt_u16(mask2);
> +      n += 16;
> +    }
> +    return n + idx;
> +  }
> +  return e;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> index 352c49d6c6b..82bb99e30af 100644
> --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b)
>    foo = m1 | m2;
>  }
>
> -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
>
>  void
>  foo_xorb (__m512i a, __m512i b)
> @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b)
>    foo = m1 ^ m2;
>  }
>
> -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
>
>  void
>  foo_andb (__m512i a, __m512i b)
> @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b)
>    foo = m1 & ~m2;
>  }
>
> -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } }  */
> +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } }  */
> --
> 2.18.1
>
  

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5196149ee32..fb885d58272 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3702,6 +3702,75 @@  (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom
+;; duplicated vpcmpuw to vpcmpuw and kmov
+;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
+(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<V48H_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	   (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+	     (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
+  "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_cmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
@@ -3735,6 +3804,72 @@  (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+       < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	   UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_int_iterator UNSPEC_PCMP_ITER
   [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
 
@@ -3771,6 +3906,72 @@  (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "const_0_to_7_operand" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+  && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI12_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
 	(unspec:<avx512fmaskmode>
@@ -3785,6 +3986,72 @@  (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
+	(zero_extend:SWI248x
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "const_0_to_7_operand" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+  && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
+      < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "&& <SWI248x:MODE>mode != E_DImode"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<VI48_AVX512VL:avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	   UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	   (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_dup 1)
+	     (match_dup 2)
+	     (match_dup 3)]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+	(zero_extend:DI
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+	     (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+	     (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	    UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512BW
+   && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn_and_split "*<avx512>_ucmp<mode>3"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
 	(not:<avx512fmaskmode>
diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C
new file mode 100644
index 00000000000..83f471331b3
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr103750-1.C
@@ -0,0 +1,50 @@ 
+/* PR target/103750 */
+/* { dg-do compile }  */
+/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
+/* { dg-final { scan-assembler-times "kmovw" 2 } } */
+/* { dg-final { scan-assembler-times "kmovd" 2 } } */
+/* There shouldn't be any kmovw/kmovd inside the loop.  */
+#include <immintrin.h>
+
+const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+  __m256i mch256 = _mm256_set1_epi16(c);
+  for ( ; n < e; n += 32) {
+    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+    if (_kortestz_mask16_u8(mask1, mask2))
+      continue;
+
+    unsigned idx = _tzcnt_u32(mask1);
+    if (mask1 == 0) {
+      idx = __tzcnt_u16(mask2);
+      n += 16;
+    }
+    return n + idx;
+  }
+  return e;
+}
+
+const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+  __m256i mch256 = _mm256_set1_epi16(c);
+  for ( ; n < e; n += 32) {
+    __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+    __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+    __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+    __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+    if (_kortestz_mask32_u8(mask1, mask2))
+      continue;
+
+    unsigned idx = _tzcnt_u32(mask1);
+    if (mask1 == 0) {
+      idx = __tzcnt_u16(mask2);
+      n += 16;
+    }
+    return n + idx;
+  }
+  return e;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
index 352c49d6c6b..82bb99e30af 100644
--- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
@@ -12,7 +12,7 @@  foo_orb (__m512i a, __m512i b)
   foo = m1 | m2;
 }
 
-/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
 
 void
 foo_xorb (__m512i a, __m512i b)
@@ -22,7 +22,7 @@  foo_xorb (__m512i a, __m512i b)
   foo = m1 ^ m2;
 }
 
-/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } }  */
 
 void
 foo_andb (__m512i a, __m512i b)
@@ -40,4 +40,4 @@  foo_andnb (__m512i a, __m512i b)
   foo = m1 & ~m2;
 }
 
-/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } }  */
+/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } }  */