Canonicalize __atomic/sync_fetch_or/xor/and for constant mask.
Commit Message
Canonicalize & and nop_convert order for
__atomic_fetch_or_*, __atomic_fetch_xor_*,
__atomic_xor_fetch_*, __sync_fetch_and_or_*,
__sync_fetch_and_xor_*, __sync_xor_and_fetch_*,
__atomic_fetch_and_* and __sync_fetch_and_and_* when the mask is constant.
i.e.
+/* Canonicalize
+ _1 = __atomic_fetch_or_4 (&v, 1, 0);
+ _2 = (int) _1;
+ _5 = _2 & 1;
+
+to
+
+ _1 = __atomic_fetch_or_4 (&v, 1, 0);
+ _2 = _1 & 1;
+ _5 = (int) _2;
+/* Convert
+ _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+ _2 = (int) _1;
+ _3 = _2 & 8192;
+to
+ _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
+ _7 = _1 & 8192;
+ _6 = (int) _7;
+ So it can be handled by optimize_atomic_bit_test_and. */
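For reference, the kind of source this targets (a minimal sketch mirroring the
new testcases; the bit index 3 is just an example) is a single-bit test of the
result of an atomic read-modify-write:

  int
  foo (_Atomic int *v)
  {
    int mask = 1 << 3;
    return __atomic_fetch_or (v, mask, __ATOMIC_RELAXED) & mask;
  }

With the canonicalization in place, optimize_atomic_bit_test_and can turn this
into a single lock bts (btc/btr for the xor/and variants) on x86 instead of a
cmpxchg loop, which is what the new scan-assembler checks verify.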
I'm trying to rewrite the match part in match.pd and found that the
canonicalization is fine when the mask is constant, but not when it is a
variable, since it will be simplified back by
/* In GIMPLE, getting rid of 2 conversions for one new results
in smaller IL. */
(simplify
(convert (bitop:cs@2 (nop_convert:s @0) @1))
(if (GIMPLE
&& TREE_CODE (@1) != INTEGER_CST
&& tree_nop_conversion_p (type, TREE_TYPE (@2))
&& types_match (type, @0))
(bitop @0 (convert @1)))))
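For example (a sketch with made-up SSA names), with an int a and a variable
unsigned int mask b, that rule turns

  _1 = (unsigned int) a_2(D);
  _3 = _1 & b_4(D);
  _5 = (int) _3;

into

  _6 = (int) b_4(D);
  _5 = a_2(D) & _6;

i.e. one conversion instead of two, which is exactly what undoes the
canonicalization when the mask is not a constant.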
The canonicalization for a variable mask would be like
convert
_1 = ~mask_7;
_2 = (unsigned int) _1;
_3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
_4 = (int) _3;
_5 = _4 & mask_7;
to
_1 = ~mask_7;
_2 = (unsigned int) _1;
_3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _4 = (unsigned int) mask_7;
  _6 = _3 & _4;
  _5 = (int) _6;

and it would be simplified back.
I've also tried another way of simplification, like
convert
_1 = ~mask_7;
_2 = (unsigned int) _1;
_3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
_4 = (int) _3;
_5 = _4 & mask_7;
to
  _1 = (unsigned int) mask_7;
  _2 = ~_1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _6 = _3 & _1;
  _5 = (int) _6;
but it's prevented by the check below, since __atomic_fetch_and_4 is not CONST
and the call would need to be re-emitted with the updated parameter.
/* We can't and should not emit calls to non-const functions. */
if (!(flags_from_decl_or_type (decl) & ECF_CONST))
return NULL;
gcc/ChangeLog:
* match.pd: Canonicalize __atomic/sync_fetch_or/xor/and for
constant mask.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr102566-1a.c: New test.
* gcc.target/i386/pr102566-2a.c: New test.
---
gcc/match.pd | 114 ++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr102566-1a.c | 66 ++++++++++++
gcc/testsuite/gcc.target/i386/pr102566-2a.c | 65 +++++++++++
3 files changed, 245 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c
Comments
On Mon, Oct 25, 2021 at 1:59 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> Canonicalize & and nop_convert order for
> __atomic_fetch_or_*, __atomic_fetch_xor_*,
> __atomic_xor_fetch_*, __sync_fetch_and_or_*,
> __sync_fetch_and_xor_*, __sync_xor_and_fetch_*,
> __atomic_fetch_and_* and __sync_fetch_and_and_* when the mask is constant.
>
> i.e.
>
> +/* Canonicalize
> + _1 = __atomic_fetch_or_4 (&v, 1, 0);
> + _2 = (int) _1;
> + _5 = _2 & 1;
> +
> +to
> +
> + _1 = __atomic_fetch_or_4 (&v, 1, 0);
> + _2 = _1 & 1;
> + _5 = (int) _2;
>
> +/* Convert
> + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> + _2 = (int) _1;
> + _3 = _2 & 8192;
> +to
> + _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
> + _7 = _1 & 8192;
> + _6 = (int) _7;
> + So it can be handled by optimize_atomic_bit_test_and. */
>
> I'm trying to rewrite the match part in match.pd and found that the
> canonicalization is fine when the mask is constant, but not when it is a
> variable, since it will be simplified back by
> /* In GIMPLE, getting rid of 2 conversions for one new results
> in smaller IL. */
> (simplify
> (convert (bitop:cs@2 (nop_convert:s @0) @1))
> (if (GIMPLE
> && TREE_CODE (@1) != INTEGER_CST
> && tree_nop_conversion_p (type, TREE_TYPE (@2))
> && types_match (type, @0))
> (bitop @0 (convert @1)))))
>
> The canonicalization for a variable mask would be like
>
> convert
> _1 = ~mask_7;
> _2 = (unsigned int) _1;
> _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
> _4 = (int) _3;
> _5 = _4 & mask_7;
>
> to
> _1 = ~mask_7;
> _2 = (unsigned int) _1;
> _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>   _4 = (unsigned int) mask_7;
>   _6 = _3 & _4;
>   _5 = (int) _6;
>
> and it would be simplified back.
>
> I've also tried another way of simplification, like
>
> convert
> _1 = ~mask_7;
> _2 = (unsigned int) _1;
> _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
> _4 = (int) _3;
> _5 = _4 & mask_7;
>
> to
>   _1 = (unsigned int) mask_7;
>   _2 = ~_1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>   _6 = _3 & _1;
>   _5 = (int) _6;
>
> but it's prevented by the check below, since __atomic_fetch_and_4 is not CONST
> and the call would need to be re-emitted with the updated parameter.
>
> /* We can't and should not emit calls to non-const functions. */
> if (!(flags_from_decl_or_type (decl) & ECF_CONST))
> return NULL;
>
> gcc/ChangeLog:
>
> * match.pd: Canonicalize __atomic/sync_fetch_or/xor/and for
> constant mask.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr102566-1a.c: New test.
> * gcc.target/i386/pr102566-2a.c: New test.
> ---
> gcc/match.pd | 114 ++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr102566-1a.c | 66 ++++++++++++
> gcc/testsuite/gcc.target/i386/pr102566-2a.c | 65 +++++++++++
> 3 files changed, 245 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5bed2e12715..545a243eae6 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> (define_operator_list COND_TERNARY
> IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS)
>
> +/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_* */
> +(define_operator_list ATOMIC_FETCH_OR_XOR_N
> + BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2
> + BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8
> + BUILT_IN_ATOMIC_FETCH_OR_16
> + BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2
> + BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8
> + BUILT_IN_ATOMIC_FETCH_XOR_16
> + BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2
> + BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8
> + BUILT_IN_ATOMIC_XOR_FETCH_16)
> +/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_* */
> +(define_operator_list SYNC_FETCH_OR_XOR_N
> + BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2
> + BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8
> + BUILT_IN_SYNC_FETCH_AND_OR_16
> + BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2
> + BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8
> + BUILT_IN_SYNC_FETCH_AND_XOR_16
> + BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2
> + BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8
> + BUILT_IN_SYNC_XOR_AND_FETCH_16)
> +/* __atomic_fetch_and_*. */
> +(define_operator_list ATOMIC_FETCH_AND_N
> + BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2
> + BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8
> + BUILT_IN_ATOMIC_FETCH_AND_16)
> +/* __sync_fetch_and_and_*. */
> +(define_operator_list SYNC_FETCH_AND_AND_N
> + BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2
> + BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8
> + BUILT_IN_SYNC_FETCH_AND_AND_16)
> +
> /* With nop_convert? combine convert? and view_convert? in one pattern
> plus conditionalize on tree_nop_conversion_p conversions. */
> (match (nop_convert @0)
> @@ -3907,6 +3940,87 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> (vec_cond @0 (op! @3 @1) (op! @3 @2))))
> #endif
>
> +#if GIMPLE
> +/* Canonicalize
> + _1 = __atomic_fetch_or_4 (&v, 1, 0);
> + _2 = (int) _1;
> + _5 = _2 & 1;
> +
> +to
> +
> + _1 = __atomic_fetch_or_4 (&v, 1, 0);
> + _2 = _1 & 1;
> + _5 = (int) _2;
> +
> + So it can be handled by optimize_atomic_bit_test_and. */
> +(simplify
> + (bit_and
> + (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
> + INTEGER_CST@4)
> + (if (single_use (@5))
> + (with { int ibit = tree_log2 (@1);
> + int ibit2 = tree_log2 (@4); }
> + (if (ibit >= 0 && ibit == ibit2)
> +     /* Make sure the second operand has the same type as @3,
> +	or else it will hit a gcc_assert.  */
> + (convert:type
> + (bit_and @3
> + { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +
> +(simplify
> + (bit_and
> + (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1))
> + INTEGER_CST@2)
> + (if (single_use (@4))
> + (with { int ibit = tree_log2 (@1);
> + int ibit2 = tree_log2 (@2); }
> + (if (ibit >= 0 && ibit == ibit2)
> +     /* Make sure the second operand has the same type as @3,
> +	or else it will hit a gcc_assert.  */
> + (convert:type
> + (bit_and @3
> + { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +/* Convert
> + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> + _2 = (int) _1;
> + _3 = _2 & 8192;
> +to
> + _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
> + _7 = _1 & 8192;
> + _6 = (int) _7;
> + So it can be handled by optimize_atomic_bit_test_and. */
> +
> +(simplify
> + (bit_and
> + (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2))
> + INTEGER_CST@4)
> + (if (single_use (@5))
> + (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> + TYPE_PRECISION(type)));
> + int ibit2 = tree_log2 (@4); }
> + (if (ibit >= 0 && ibit == ibit2)
> +     /* Make sure the second operand has the same type as @3,
> +	or else it will hit a gcc_assert.  */
> + (convert:type
> + (bit_and @3
> + { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +
Now the generated code checks the single_use condition first:
static bool
gimple_simplify_469 (gimple_match_op *res_op, gimple_seq *seq,
		     tree (*valueize)(tree) ATTRIBUTE_UNUSED,
		     const tree ARG_UNUSED (type), tree *ARG_UNUSED (captures),
		     const combined_fn ARG_UNUSED (ATOMIC_FETCH_OR_XOR_N))
{
  /* #line 3960 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd" */
  if (single_use (captures[0]))
    {
      {
	/* #line 3961 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd" */
	int ibit = tree_log2 (captures[3]);
	int ibit2 = tree_log2 (captures[5]);
	/* #line 3963 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd" */
	if (ibit >= 0 && ibit == ibit2)
	  {
	    gimple_seq *lseq = seq;
	    if (__builtin_expect (!dbg_cnt (match), 0))
	      goto next_after_fail763;
	    if (__builtin_expect (dump_file && (dump_flags & TDF_FOLDING), 0))
	      fprintf (dump_file, "Applying pattern %s:%d, %s:%d\n",
		       "match.pd", 3966, __FILE__, __LINE__);
	    {
	      res_op->set_op (NOP_EXPR, type, 1);
	      {
		tree _o1[2], _r1;
		_o1[0] = captures[1];
		_o1[1] = build_int_cst (TREE_TYPE (captures[1]),
					HOST_WIDE_INT_1U << ibit);
		gimple_match_op tem_op (res_op->cond.any_else (),
					BIT_AND_EXPR, TREE_TYPE (_o1[0]),
					_o1[0], _o1[1]);
		tem_op.resimplify (lseq, valueize);
		_r1 = maybe_push_res_to_seq (&tem_op, lseq);
		if (!_r1) goto next_after_fail763;
		res_op->ops[0] = _r1;
	      }
	      res_op->resimplify (lseq, valueize);
	      return true;
	    }
	    next_after_fail763:;
> +(simplify
> + (bit_and
> + (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1))
> + INTEGER_CST@2)
> + (if (single_use (@4))
> + (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> + TYPE_PRECISION(type)));
> + int ibit2 = tree_log2 (@2); }
> + (if (ibit >= 0 && ibit == ibit2)
> +     /* Make sure the second operand has the same type as @3,
> +	or else it will hit a gcc_assert.  */
> + (convert:type
> + (bit_and @3
> + { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +#endif
> +
> /* (v ? w : 0) ? a : b is just (v & w) ? a : b
> Currently disabled after pass lvec because ARM understands
> VEC_COND_EXPR<v==w,-1,0> but not a plain v==w fed to BIT_IOR_EXPR. */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> new file mode 100644
> index 00000000000..2657a2f62ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +
> +#define FOO(TYPE,MASK) \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __sync_fetch_and_or (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __sync_fetch_and_xor (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __sync_xor_and_fetch (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1 << MASK; \
> + return __sync_fetch_and_and (a, ~mask) & mask; \
> + } \
> +
> +FOO(short, 0);
> +FOO(short, 7);
> +FOO(short, 15);
> +FOO(int, 0);
> +FOO(int, 15);
> +FOO(int, 31);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> new file mode 100644
> index 00000000000..24681c1da18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +typedef long long int64;
> +
> +#define FOO(TYPE,MASK) \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __sync_fetch_and_or (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __sync_fetch_and_xor (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __sync_xor_and_fetch (a, mask) & mask; \
> + } \
> + __attribute__((noinline,noclone)) TYPE \
> + sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a) \
> + { \
> + TYPE mask = 1ll << MASK; \
> + return __sync_fetch_and_and (a, ~mask) & mask; \
> + } \
> +
> +
> +FOO(int64, 0);
> +FOO(int64, 32);
> +FOO(int64, 63);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> --
> 2.18.1
>