From patchwork Fri Oct 22 05:48:51 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: liuhongt X-Patchwork-Id: 46520 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id ED4363857817 for ; Fri, 22 Oct 2021 05:49:29 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org ED4363857817 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1634881770; bh=7iV8qntUWDvQeT9ZG+swhK+hEH/pXo59bnVcuk/9tiM=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=DFMOzGd+o7IAKrcuPBgqNT3GWzQTajGMxuYoHh4igeM9WpOpfTpAYKCxok61V1FhQ 5ZaLy9w4dcVRnqNLMBtNl9trMnVT0gQGQoVIXboQwlAu02c71dMiF0BiEVYf+c67sV l5XisMX2b7U0ARVYsUUwpVd5netTNQzV+/a32778= X-Original-To: gcc-patches@gcc.gnu.org Delivered-To: gcc-patches@gcc.gnu.org Received: from mga09.intel.com (mga09.intel.com [134.134.136.24]) by sourceware.org (Postfix) with ESMTPS id 6DEBA3858416 for ; Fri, 22 Oct 2021 05:48:57 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 6DEBA3858416 X-IronPort-AV: E=McAfee;i="6200,9189,10144"; a="229095086" X-IronPort-AV: E=Sophos;i="5.87,171,1631602800"; d="scan'208";a="229095086" Received: from fmsmga003.fm.intel.com ([10.253.24.29]) by orsmga102.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 21 Oct 2021 22:48:55 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.87,171,1631602800"; d="scan'208";a="568832593" Received: from scymds01.sc.intel.com ([10.148.94.138]) by FMSMGA003.fm.intel.com with ESMTP; 21 Oct 2021 22:48:53 -0700 Received: from shliclel051.sh.intel.com (shliclel051.sh.intel.com [10.239.236.51]) by scymds01.sc.intel.com with ESMTP id 19M5mq5G021206; Thu, 21 Oct 2021 22:48:52 -0700 To: gcc-patches@gcc.gnu.org Subject: [PATCH] Canonicalize 
__atomic/sync_fetch_or/xor/and for constant mask. Date: Fri, 22 Oct 2021 13:48:51 +0800 Message-Id: <20211022054851.1045-1-hongtao.liu@intel.com> X-Mailer: git-send-email 2.18.1 In-Reply-To: References: X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, GIT_PATCH_0, KAM_DMARC_NONE, KAM_DMARC_STATUS, KAM_LAZY_DOMAIN_SECURITY, KAM_SHORT, RCVD_IN_DNSWL_LOW, RCVD_IN_MSPIKE_H3, RCVD_IN_MSPIKE_WL, SPF_HELO_NONE, SPF_NONE, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: liuhongt via Gcc-patches From: liuhongt Reply-To: liuhongt Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" Hi: This patch tries to canonicalize bit_and and nop_convert order for __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_*,__sync_fetch_and_or_*, __sync_fetch_and_xor_*,__sync_xor_and_fetch_*, __atomic_fetch_and_*,__sync_fetch_and_and_* when mask is constant. i.e. +/* Canonicalize + _1 = __atomic_fetch_or_4 (&v, 1, 0); + _2 = (int) _1; + _5 = _2 & 1; + +to + + _1 = __atomic_fetch_or_4 (&v, 1, 0); + _2 = _1 & 1; + _5 = (int) _2; +/* Convert + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0); + _2 = (int) _1; + _3 = _2 & 8192; +to + _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0); + _7 = _1 & 8192; + _6 = (int) _7; + So it can be handled by optimize_atomic_bit_test_and. */ I'm trying to rewrite the match part in match.pd and find the canonicalization is ok when mask is constant, but not for a variable mask since it will be simplified back by /* In GIMPLE, getting rid of 2 conversions for one new results in smaller IL. 
*/ (simplify (convert (bitop:cs@2 (nop_convert:s @0) @1)) (if (GIMPLE && TREE_CODE (@1) != INTEGER_CST && tree_nop_conversion_p (type, TREE_TYPE (@2)) && types_match (type, @0)) (bitop @0 (convert @1))))) The canonicalization for variables is like convert _1 = ~mask_7; _2 = (unsigned int) _1; _3 = __atomic_fetch_and_4 (ptr_6, _2, 0); _4 = (int) _3; _5 = _4 & mask_7; to _1 = ~mask_7; _2 = (unsigned int) _1; _3 = __atomic_fetch_and_4 (ptr_6, _2, 0); _4 = (unsigned int) mask_7 _6 = _3 & _4 _5 = (int) _6 and be simplified back. I've also tried another way of simplification like convert _1 = ~mask_7; _2 = (unsigned int) _1; _3 = __atomic_fetch_and_4 (ptr_6, _2, 0); _4 = (int) _3; _5 = _4 & mask_7; to _1 = (unsigned int)mask_7; _2 = ~ _1; _3 = __atomic_fetch_and_4 (ptr_6, _2, 0); _6 = _3 & _1 _5 = (int) but it's prevented by the check below since __atomic_fetch_and_4 is not CONST, but we need to regenerate it with an updated parameter. /* We can't and should not emit calls to non-const functions. */ if (!(flags_from_decl_or_type (decl) & ECF_CONST)) return NULL; Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: * match.pd: Canonicalize bit_and and nop_convert order for __atomic/sync_fetch_or/xor/and when mask is constant. gcc/testsuite/ChangeLog: * gcc.target/i386/pr102566-1a.c: New test. * gcc.target/i386/pr102566-2a.c: New test. 
--- gcc/match.pd | 118 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr102566-1a.c | 66 +++++++++++ gcc/testsuite/gcc.target/i386/pr102566-2a.c | 65 +++++++++++ 3 files changed, 249 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c diff --git a/gcc/match.pd b/gcc/match.pd index 5bed2e12715..06b369d1ab1 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (define_operator_list COND_TERNARY IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS) +/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_* */ +(define_operator_list ATOMIC_FETCH_OR_XOR_N + BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2 + BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8 + BUILT_IN_ATOMIC_FETCH_OR_16 + BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2 + BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8 + BUILT_IN_ATOMIC_FETCH_XOR_16 + BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2 + BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8 + BUILT_IN_ATOMIC_XOR_FETCH_16) +/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_* */ +(define_operator_list SYNC_FETCH_OR_XOR_N + BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2 + BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8 + BUILT_IN_SYNC_FETCH_AND_OR_16 + BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2 + BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8 + BUILT_IN_SYNC_FETCH_AND_XOR_16 + BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2 + BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8 + BUILT_IN_SYNC_XOR_AND_FETCH_16) +/* __atomic_fetch_and_*. */ +(define_operator_list ATOMIC_FETCH_AND_N + BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2 + BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8 + BUILT_IN_ATOMIC_FETCH_AND_16) +/* __sync_fetch_and_and_*. 
*/ +(define_operator_list SYNC_FETCH_AND_AND_N + BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2 + BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8 + BUILT_IN_SYNC_FETCH_AND_AND_16) + /* With nop_convert? combine convert? and view_convert? in one pattern plus conditionalize on tree_nop_conversion_p conversions. */ (match (nop_convert @0) @@ -3907,6 +3940,91 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (vec_cond @0 (op! @3 @1) (op! @3 @2)))) #endif +#if GIMPLE +/* Canonicalize + _1 = __atomic_fetch_or_4 (&v, 1, 0); + _2 = (int) _1; + _5 = _2 & 1; + +to + + _1 = __atomic_fetch_or_4 (&v, 1, 0); + _2 = _1 & 1; + _5 = (int) _2; + + So it can be handled by optimize_atomic_bit_test_and. */ +(simplify + (bit_and + (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2)) + INTEGER_CST@4) + (with { int ibit = tree_log2 (@1); + int ibit2 = tree_log2 (@4); } + (if (ibit >= 0 && ibit == ibit2 + && single_use (@5)) + /* Make sure the second operand have the same type as @3 + orelse will hit gcc_asssert. */ + (convert:type + (bit_and @3 + { build_int_cst (TREE_TYPE (@3), + HOST_WIDE_INT_1U << ibit);}))))) + +(simplify + (bit_and + (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1)) + INTEGER_CST@2) + (with { int ibit = tree_log2 (@1); + int ibit2 = tree_log2 (@2); } + (if (ibit >= 0 && ibit == ibit2 + && single_use (@4)) + /* Make sure the second operand have the same type as @3 + orelse will hit gcc_asssert. */ + (convert:type + (bit_and @3 + { build_int_cst (TREE_TYPE (@3), + HOST_WIDE_INT_1U << ibit);}))))) +/* Convert + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0); + _2 = (int) _1; + _3 = _2 & 8192; +to + _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0); + _7 = _1 & 8192; + _6 = (int) _7; + So it can be handled by optimize_atomic_bit_test_and. 
*/ + +(simplify + (bit_and + (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2)) + INTEGER_CST@4) + (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)), + TYPE_PRECISION(type))); + int ibit2 = tree_log2 (@4); } + (if (ibit >= 0 && ibit == ibit2 + && single_use (@5)) + /* Make sure the second operand have the same type as @3 + orelse will hit gcc_asssert. */ + (convert:type + (bit_and @3 + { build_int_cst (TREE_TYPE (@3), + HOST_WIDE_INT_1U << ibit);}))))) + +(simplify + (bit_and + (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1)) + INTEGER_CST@2) + (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)), + TYPE_PRECISION(type))); + int ibit2 = tree_log2 (@2); } + (if (ibit >= 0 && ibit == ibit2 + && single_use (@4)) + /* Make sure the second operand have the same type as @3 + orelse will hit gcc_asssert. */ + (convert:type + (bit_and @3 + { build_int_cst (TREE_TYPE (@3), + HOST_WIDE_INT_1U << ibit);}))))) +#endif + /* (v ? w : 0) ? a : b is just (v & w) ? a : b Currently disabled after pass lvec because ARM understands VEC_COND_EXPR but not a plain v==w fed to BIT_IOR_EXPR. 
*/ diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c new file mode 100644 index 00000000000..2657a2f62ae --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +#include +#include + +#define FOO(TYPE,MASK) \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __sync_fetch_and_or (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __sync_fetch_and_xor (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __sync_xor_and_fetch (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1 << MASK; \ + return __sync_fetch_and_and (a, ~mask) & mask; \ + } \ + +FOO(short, 0); +FOO(short, 7); +FOO(short, 15); +FOO(int, 0); +FOO(int, 15); +FOO(int, 
31); + +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */ +/* { dg-final { scan-assembler-not "cmpxchg" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c new file mode 100644 index 00000000000..24681c1da18 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c @@ -0,0 +1,65 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ +#include +#include +typedef long long int64; + +#define FOO(TYPE,MASK) \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __sync_fetch_and_or (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __sync_fetch_and_xor (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return 
__sync_xor_and_fetch (a, mask) & mask; \ + } \ + __attribute__((noinline,noclone)) TYPE \ + sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a) \ + { \ + TYPE mask = 1ll << MASK; \ + return __sync_fetch_and_and (a, ~mask) & mask; \ + } \ + + +FOO(int64, 0); +FOO(int64, 32); +FOO(int64, 63); + +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */ +/* { dg-final { scan-assembler-not "cmpxchg" } } */