[i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}.
Commit Message
The purpose of those define_insn_and_split:
1. Combine vpcmpuw and zero_extend into vpcmpuw.
2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov
3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.
It should partially fix the issue in PR.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
PR target/103750
* config/i386/sse.md
(*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
New define_insn_and_split.
(*<avx512>_cmp<mode>3): Ditto.
(*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
(*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
New define_insn_and_split.
(*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
Ditto.
(*<avx512>_ucmp<mode>3): Ditto.
(*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
(*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
New define_insn_and_split.
gcc/testsuite/ChangeLog:
* gcc.target/i386/bitwise_mask_op-3.c: Adjust test/
* g++.target/i386/pr103750-1.C: New test.
---
gcc/config/i386/sse.md | 267 ++++++++++++++++++
gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++
.../gcc.target/i386/bitwise_mask_op-3.c | 6 +-
3 files changed, 320 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C
Comments
On Tue, Dec 21, 2021 at 2:27 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> The purpose of those define_insn_and_split:
> 1. Combine vpcmpuw and zero_extend into vpcmpuw.
> 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov
> 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.
Use DImode as dest of zero_extend is too aggressive which causes
several regression.
New patch add define_insn_and_split just combine vpcmpuw and
zero_extend into vpcmpuw.
Here's the patch i'm checking in.
>
> It should partially fix the issue in PR.
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.
>
> gcc/ChangeLog:
>
> PR target/103750
> * config/i386/sse.md
> (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
> New define_insn_and_split.
> (*<avx512>_cmp<mode>3): Ditto.
> (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn.
> (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
> New define_insn_and_split.
> (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
> Ditto.
> (*<avx512>_ucmp<mode>3): Ditto.
> (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn.
> (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>):
> New define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/
> * g++.target/i386/pr103750-1.C: New test.
> ---
> gcc/config/i386/sse.md | 267 ++++++++++++++++++
> gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++
> .../gcc.target/i386/bitwise_mask_op-3.c | 6 +-
> 3 files changed, 320 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 5196149ee32..fb885d58272 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom
> +;; duplicated vpcmpuw to vpcmpuw and kmov
> +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
> +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> + [(set (match_operand:SWI248x 0 "register_operand" "=k")
> + (zero_extend:SWI248x
> + (unspec:<V48H_AVX512VL:avx512fmaskmode>
> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP)))]
> + "TARGET_AVX512BW
> + && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
> + < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + "&& <SWI248x:MODE>mode != E_DImode"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<V48H_AVX512VL:avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> + (unspec:<avx512fmaskmode>
> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<MODE>mode) < 64"
> + "#"
> + "&& 1"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> + [(set (match_operand:DI 0 "register_operand" "=k")
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
> + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP)))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
> + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_insn_and_split "*<avx512>_cmp<mode>3"
> [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
> (not:<avx512fmaskmode>
> @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> + [(set (match_operand:SWI248x 0 "register_operand" "=k")
> + (zero_extend:SWI248x
> + (unspec:<VI12_AVX512VL:avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP)))]
> + "TARGET_AVX512BW
> + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> + < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + "&& <SWI248x:MODE>mode != E_DImode"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<VI12_AVX512VL:avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_cmp<mode>3"
> + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> + "#"
> + "&& 1"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
> + [(set (match_operand:DI 0 "register_operand" "=k")
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_PCMP)))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_int_iterator UNSPEC_PCMP_ITER
> [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
>
> @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> + [(set (match_operand:SWI248x 0 "register_operand" "=k")
> + (zero_extend:SWI248x
> + (unspec:<VI12_AVX512VL:avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "const_0_to_7_operand" "n")]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "TARGET_AVX512BW
> + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
> + < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + "&& <SWI248x:MODE>mode != E_DImode"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<VI12_AVX512VL:avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_UNSIGNED_PCMP))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> + "#"
> + "&& 1"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> + [(set (match_operand:DI 0 "register_operand" "=k")
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
> + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
> [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> (unspec:<avx512fmaskmode>
> @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
> + [(set (match_operand:SWI248x 0 "register_operand" "=k")
> + (zero_extend:SWI248x
> + (unspec:<VI48_AVX512VL:avx512fmaskmode>
> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "const_0_to_7_operand" "n")]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "TARGET_AVX512BW
> + && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
> + < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
> + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + "&& <SWI248x:MODE>mode != E_DImode"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<VI48_AVX512VL:avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
> +
> +(define_insn_and_split "*<avx512>_ucmp<mode>3"
> + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_UNSIGNED_PCMP))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> + "#"
> + "&& 1"
> + [(set (match_dup 0)
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_dup 1)
> + (match_dup 2)
> + (match_dup 3)]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
> + [(set (match_operand:DI 0 "register_operand" "=k")
> + (zero_extend:DI
> + (unspec:<avx512fmaskmode>
> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
> + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
> + (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
> + UNSPEC_UNSIGNED_PCMP)))]
> + "TARGET_AVX512BW
> + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
> + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + [(set_attr "type" "ssecmp")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_insn_and_split "*<avx512>_ucmp<mode>3"
> [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
> (not:<avx512fmaskmode>
> diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C
> new file mode 100644
> index 00000000000..83f471331b3
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C
> @@ -0,0 +1,50 @@
> +/* PR target/103750 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
> +/* { dg-final { scan-assembler-times "kmovw" 2 } } */
> +/* { dg-final { scan-assembler-times "kmovd" 2 } } */
> +/* There shouldn't be any kmovw/kmovd inside the loop. */
> +#include <immintrin.h>
> +
> +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> + __m256i mch256 = _mm256_set1_epi16(c);
> + for ( ; n < e; n += 32) {
> + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> + if (_kortestz_mask16_u8(mask1, mask2))
> + continue;
> +
> + unsigned idx = _tzcnt_u32(mask1);
> + if (mask1 == 0) {
> + idx = __tzcnt_u16(mask2);
> + n += 16;
> + }
> + return n + idx;
> + }
> + return e;
> +}
> +
> +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> + __m256i mch256 = _mm256_set1_epi16(c);
> + for ( ; n < e; n += 32) {
> + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> + if (_kortestz_mask32_u8(mask1, mask2))
> + continue;
> +
> + unsigned idx = _tzcnt_u32(mask1);
> + if (mask1 == 0) {
> + idx = __tzcnt_u16(mask2);
> + n += 16;
> + }
> + return n + idx;
> + }
> + return e;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> index 352c49d6c6b..82bb99e30af 100644
> --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b)
> foo = m1 | m2;
> }
>
> -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */
>
> void
> foo_xorb (__m512i a, __m512i b)
> @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b)
> foo = m1 ^ m2;
> }
>
> -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */
>
> void
> foo_andb (__m512i a, __m512i b)
> @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b)
> foo = m1 & ~m2;
> }
>
> -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */
> --
> 2.18.1
>
@@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom
+;; duplicated vpcmpuw to vpcmpuw and kmov
+;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg.
+(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+ [(set (match_operand:SWI248x 0 "register_operand" "=k")
+ (zero_extend:SWI248x
+ (unspec:<V48H_AVX512VL:avx512fmaskmode>
+ [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+ (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512BW
+ && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode)
+ < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+ "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ "&& <SWI248x:MODE>mode != E_DImode"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<V48H_AVX512VL:avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+ (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<MODE>mode) < 64"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+ [(set (match_operand:DI 0 "register_operand" "=k")
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_operand:V48H_AVX512VL 1 "register_operand" "v")
+ (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64"
+ "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_insn_and_split "*<avx512>_cmp<mode>3"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand")
(not:<avx512fmaskmode>
@@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+ [(set (match_operand:SWI248x 0 "register_operand" "=k")
+ (zero_extend:SWI248x
+ (unspec:<VI12_AVX512VL:avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512BW
+ && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+ < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+ "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ "&& <SWI248x:MODE>mode != E_DImode"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<VI12_AVX512VL:avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_cmp<mode>3_zero_extenddi"
+ [(set (match_operand:DI 0 "register_operand" "=k")
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+ "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_int_iterator UNSPEC_PCMP_ITER
[UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
@@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+ [(set (match_operand:SWI248x 0 "register_operand" "=k")
+ (zero_extend:SWI248x
+ (unspec:<VI12_AVX512VL:avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "const_0_to_7_operand" "n")]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "TARGET_AVX512BW
+ && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode)
+ < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+ "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ "&& <SWI248x:MODE>mode != E_DImode"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<VI12_AVX512VL:avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_UNSIGNED_PCMP))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+ [(set (match_operand:DI 0 "register_operand" "=k")
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64"
+ "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
@@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>"
+ [(set (match_operand:SWI248x 0 "register_operand" "=k")
+ (zero_extend:SWI248x
+ (unspec:<VI48_AVX512VL:avx512fmaskmode>
+ [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "const_0_to_7_operand" "n")]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "TARGET_AVX512BW
+ && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode)
+ < GET_MODE_PRECISION (<SWI248x:MODE>mode))"
+ "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ "&& <SWI248x:MODE>mode != E_DImode"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<VI48_AVX512VL:avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")])
+
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_UNSIGNED_PCMP))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 3)]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi"
+ [(set (match_operand:DI 0 "register_operand" "=k")
+ (zero_extend:DI
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "TARGET_AVX512BW
+ && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64"
+ "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_insn_and_split "*<avx512>_ucmp<mode>3"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand")
(not:<avx512fmaskmode>
new file mode 100644
@@ -0,0 +1,50 @@
+/* PR target/103750 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */
+/* { dg-final { scan-assembler-times "kmovw" 2 } } */
+/* { dg-final { scan-assembler-times "kmovd" 2 } } */
+/* There shouldn't be any kmovw/kmovd inside the loop. */
+#include <immintrin.h>
+
+const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+ __m256i mch256 = _mm256_set1_epi16(c);
+ for ( ; n < e; n += 32) {
+ __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+ __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+ __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+ __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+ if (_kortestz_mask16_u8(mask1, mask2))
+ continue;
+
+ unsigned idx = _tzcnt_u32(mask1);
+ if (mask1 == 0) {
+ idx = __tzcnt_u16(mask2);
+ n += 16;
+ }
+ return n + idx;
+ }
+ return e;
+}
+
+const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+ __m256i mch256 = _mm256_set1_epi16(c);
+ for ( ; n < e; n += 32) {
+ __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+ __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+ __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+ __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+ if (_kortestz_mask32_u8(mask1, mask2))
+ continue;
+
+ unsigned idx = _tzcnt_u32(mask1);
+ if (mask1 == 0) {
+ idx = __tzcnt_u16(mask2);
+ n += 16;
+ }
+ return n + idx;
+ }
+ return e;
+}
+
@@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b)
foo = m1 | m2;
}
-/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */
void
foo_xorb (__m512i a, __m512i b)
@@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b)
foo = m1 ^ m2;
}
-/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */
void
foo_andb (__m512i a, __m512i b)
@@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b)
foo = m1 & ~m2;
}
-/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */