Improve integer bit test on atomic builtin return

Message ID 20211004135354.1395400-1-hjl.tools@gmail.com
State New
Headers
Series Improve integer bit test on atomic builtin return |

Commit Message

H.J. Lu Oct. 4, 2021, 1:53 p.m. UTC
  commit adedd5c173388ae505470df152b9cb3947339566
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Tue May 3 13:37:25 2016 +0200

    re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')

optimized bit test on atomic builtin return with lock bts/btr/btc.  But
it works only for unsigned integers since atomic builtins operate on the
'uintptr_t' type.  It fails on bool:

  _1 = atomic builtin;
  _4 = (_Bool) _1;

and signed integers:

  _1 = atomic builtin;
  _2 = (int) _1;
  _5 = _2 & (1 << N);

Improve bit test on atomic builtin return by converting:

  _1 = atomic builtin;
  _4 = (_Bool) _1;

to

  _1 = atomic builtin;
  _5 = _1 & (1 << 0);
  _4 = (_Bool) _5;

and converting:

  _1 = atomic builtin;
  _2 = (int) _1;
  _5 = _2 & (1 << N);

to
  _1 = atomic builtin;
  _6 = _1 & (1 << N);
  _5 = (int) _6;

gcc/

	PR middle-end/102566
	* tree-ssa-ccp.c (optimize_atomic_bit_test_and): Handle cast
	between atomic builtin and bit test.

gcc/testsuite/

	PR middle-end/102566
	* g++.target/i386/pr102566-1.C: New test.
	* gcc.target/i386/pr102566-1a.c: Likewise.
	* gcc.target/i386/pr102566-1b.c: Likewise.
	* gcc.target/i386/pr102566-2.c: Likewise.
---
 gcc/testsuite/g++.target/i386/pr102566-1.C  |  12 ++
 gcc/testsuite/gcc.target/i386/pr102566-1a.c | 188 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-1b.c | 107 +++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-2.c  |  14 ++
 gcc/tree-ssa-ccp.c                          | 136 +++++++++++++-
 5 files changed, 452 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2.c
  

Comments

Richard Biener Oct. 5, 2021, 10:07 a.m. UTC | #1
On Mon, 4 Oct 2021, H.J. Lu wrote:

> commit adedd5c173388ae505470df152b9cb3947339566
> Author: Jakub Jelinek <jakub@redhat.com>
> Date:   Tue May 3 13:37:25 2016 +0200
> 
>     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> 
> optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> it works only for unsigned integers since atomic builtins operate on the
> 'uintptr_t' type.  It fails on bool:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> and signed integers:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> Improve bit test on atomic builtin return by converting:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> to
> 
>   _1 = atomic builtin;
>   _5 = _1 & (1 << 0);
>   _4 = (_Bool) _5;
> 
> and converting:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> to
>   _1 = atomic builtin;
>   _6 = _1 & (1 << N);
>   _5 = (int) _6;

Why not do this last bit with match.pd patterns (and independent on
whether _1 is defined by an atomic builtin)?  For the first suggested
transform that's likely going to be undone by folding, no?

Richard.

> gcc/
> 
> 	PR middle-end/102566
> 	* tree-ssa-ccp.c (optimize_atomic_bit_test_and): Handle cast
> 	between atomic builtin and bit test.
> 
> gcc/testsuite/
> 
> 	PR middle-end/102566
> 	* g++.target/i386/pr102566-1.C: New test.
> 	* gcc.target/i386/pr102566-1a.c: Likewise.
> 	* gcc.target/i386/pr102566-1b.c: Likewise.
> 	* gcc.target/i386/pr102566-2.c: Likewise.
> ---
>  gcc/testsuite/g++.target/i386/pr102566-1.C  |  12 ++
>  gcc/testsuite/gcc.target/i386/pr102566-1a.c | 188 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-1b.c | 107 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-2.c  |  14 ++
>  gcc/tree-ssa-ccp.c                          | 136 +++++++++++++-
>  5 files changed, 452 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr102566-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2.c
> 
> diff --git a/gcc/testsuite/g++.target/i386/pr102566-1.C b/gcc/testsuite/g++.target/i386/pr102566-1.C
> new file mode 100644
> index 00000000000..6e33298d8bf
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr102566-1.C
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target c++11 } } */
> +/* { dg-options "-O2" } */
> +
> +#include <atomic>
> +
> +bool tbit(std::atomic<int> &i)
> +{
> +  return i.fetch_or(1, std::memory_order_relaxed) & 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> new file mode 100644
> index 00000000000..a915de354e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> @@ -0,0 +1,188 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +void bar (void);
> +
> +__attribute__((noinline, noclone)) int
> +f1 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f2 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED);
> +  int t2 = t1 & mask;
> +  return t2 != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f3 (long int *a, int bit)
> +{
> +  long int mask = 1l << bit;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f4 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f5 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f6 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f7 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  if ((__sync_fetch_and_xor (a, mask) & mask) != 0)
> +    bar ();
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f8 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0)
> +    bar ();
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f9 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f10 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_xor (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f11 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f12 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f13 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f14 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f15 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f16 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f17 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f18 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f19 (long int *a, int bit)
> +{
> +  long int mask = 1l << bit;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f20 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f21 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_or (a, mask) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f22 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f23 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) short int
> +f24 (short int *a)
> +{
> +  short int mask = 1 << 7;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) short int
> +f25 (short int *a)
> +{
> +  short int mask = 1 << 7;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1b.c b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
> new file mode 100644
> index 00000000000..c4dab8135c7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
> @@ -0,0 +1,107 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -g" } */
> +
> +int cnt;
> +
> +__attribute__((noinline, noclone)) void
> +bar (void)
> +{
> +  cnt++;
> +}
> +
> +#include "pr102566-1a.c"
> +
> +int a;
> +long int b;
> +unsigned long int c;
> +unsigned short int d;
> +
> +int
> +main ()
> +{
> +  __atomic_store_n (&a, 15, __ATOMIC_RELAXED);
> +  if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15
> +      || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31)
> +    __builtin_abort ();
> +  if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31
> +      || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63)
> +    __builtin_abort ();
> +  __atomic_store_n (&b, 24, __ATOMIC_RELAXED);
> +  if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28
> +      || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 0, __ATOMIC_RELAXED);
> +  if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128
> +      || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128)
> +    __builtin_abort ();
> +  if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
> +      || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320)
> +    __builtin_abort ();
> +  if (f6 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321
> +      || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (cnt != 0
> +      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
> +      || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
> +      || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
> +      || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
> +    __builtin_abort ();
> +  if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
> +      || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
> +      || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 8321, __ATOMIC_RELAXED);
> +  if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
> +    __builtin_abort ();
> +  if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
> +      || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
> +      || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
> +      || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
> +      || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 128, __ATOMIC_RELAXED);
> +  if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144
> +      || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144)
> +    __builtin_abort ();
> +  __atomic_store_n (&c, 1, __ATOMIC_RELAXED);
> +  if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
> +      || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
> +      || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128
> +      || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128)
> +    __builtin_abort ();
> +  __atomic_store_n (&d, 1, __ATOMIC_RELAXED);
> +  if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
> +      || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
> +      || cnt != 2)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2.c b/gcc/testsuite/gcc.target/i386/pr102566-2.c
> new file mode 100644
> index 00000000000..d1c30315353
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +
> +bool
> +foo (_Atomic int *v)
> +{
> +  return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
> index 70ce6a4d5b8..a3f7b7f233e 100644
> --- a/gcc/tree-ssa-ccp.c
> +++ b/gcc/tree-ssa-ccp.c
> @@ -3279,10 +3279,115 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>        || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)
>        || !single_imm_use (lhs, &use_p, &use_stmt)
>        || !is_gimple_assign (use_stmt)
> -      || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR
>        || !gimple_vdef (call))
>      return;
>  
> +  mask = gimple_call_arg (call, 1);
> +  tree_code rhs_code = gimple_assign_rhs_code (use_stmt);
> +  if (rhs_code != BIT_AND_EXPR)
> +    {
> +      if (rhs_code != NOP_EXPR)
> +	return;
> +
> +      tree nop_lhs = gimple_assign_lhs (use_stmt);
> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (nop_lhs))
> +	return;
> +
> +      tree nop_rhs = gimple_assign_rhs1 (use_stmt);
> +
> +      gimple *g;
> +      gimple_stmt_iterator gsi;
> +      tree var;
> +
> +      if (TREE_CODE (TREE_TYPE (nop_lhs)) == BOOLEAN_TYPE)
> +	{
> +	  /* Convert
> +	     _1 = atomic bit op;
> +	     _4 = (_Bool) _1;
> +	     to
> +	     _1 = atomic bit op;
> +	     _5 = _1 & 1;
> +	     _4 = (_Bool) _5;
> +	   */
> +	  var = make_ssa_name (TREE_TYPE (nop_rhs));
> +	  replace_uses_by (nop_rhs, var);
> +	  g = gimple_build_assign (var, BIT_AND_EXPR, nop_rhs,
> +				   build_int_cst (TREE_TYPE (lhs), 1));
> +	  gsi = gsi_for_stmt (use_stmt);
> +	  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
> +	  use_stmt = g;
> +	}
> +      else if (TYPE_PRECISION (TREE_TYPE (nop_lhs))
> +	       == TYPE_PRECISION (TREE_TYPE (nop_rhs)))
> +	{
> +	  gimple *use_nop_stmt;
> +	  if (!single_imm_use (nop_lhs, &use_p, &use_nop_stmt)
> +	      || !is_gimple_assign (use_nop_stmt)
> +	      || gimple_assign_rhs_code (use_nop_stmt) != BIT_AND_EXPR)
> +	    return;
> +
> +	  tree op_mask = mask;
> +	  if (TREE_CODE (op_mask) == SSA_NAME)
> +	    {
> +	      g = SSA_NAME_DEF_STMT (op_mask);
> +	      if (gimple_assign_rhs_code (g) == NOP_EXPR)
> +		{
> +		  tree mask_nop_lhs = gimple_assign_lhs (g);
> +
> +		  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (mask_nop_lhs))
> +		    return;
> +
> +		  tree mask_nop_rhs = gimple_assign_rhs1 (g);
> +		  if (TYPE_PRECISION (TREE_TYPE (mask_nop_lhs))
> +		      != TYPE_PRECISION (TREE_TYPE (mask_nop_rhs)))
> +		    return;
> +		  op_mask = mask_nop_rhs;
> +		  g = SSA_NAME_DEF_STMT (op_mask);
> +		}
> +
> +	      if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
> +		{
> +		  if (!is_gimple_assign (g)
> +		      || gimple_assign_rhs_code (g) != BIT_NOT_EXPR)
> +		    return;
> +		  tree reset_mask = gimple_assign_rhs1 (g);
> +		  if (TREE_CODE (op_mask) != SSA_NAME)
> +		    return;
> +		  g = SSA_NAME_DEF_STMT (reset_mask);
> +		}
> +
> +	      if (!is_gimple_assign (g)
> +		  || gimple_assign_rhs_code (g) != LSHIFT_EXPR
> +		  || !integer_onep (gimple_assign_rhs1 (g)))
> +		return;
> +	    }
> +
> +	  /* Convert
> +	     _1 = atomic bit op;
> +	     _2 = (int) _1;
> +	     _5 = _2 & N;
> +	     to
> +	     _1 = atomic bit op;
> +	     _6 = _1 & N;
> +	     _5 = (int) _6;
> +	   */
> +	  replace_uses_by (nop_lhs, lhs);
> +	  tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
> +	  var = make_ssa_name (TREE_TYPE (use_nop_lhs));
> +	  gimple_assign_set_lhs (use_nop_stmt, var);
> +	  gsi = gsi_for_stmt (use_stmt);
> +	  gsi_remove (&gsi, true);
> +	  release_defs (use_stmt);
> +	  gsi_remove (gsip, true);
> +	  var = build1 (NOP_EXPR, TREE_TYPE (use_nop_lhs), var);
> +	  gsi = gsi_for_stmt (use_nop_stmt);
> +	  g = gimple_build_assign (use_nop_lhs, var);
> +	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	  use_stmt = use_nop_stmt;
> +	  mask = op_mask;
> +	}
> +    }
> +
>    switch (fn)
>      {
>      case IFN_ATOMIC_BIT_TEST_AND_SET:
> @@ -3301,7 +3406,6 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>    if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing)
>      return;
>  
> -  mask = gimple_call_arg (call, 1);
>    tree use_lhs = gimple_assign_lhs (use_stmt);
>    if (!use_lhs)
>      return;
> @@ -3434,18 +3538,40 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>  	 of the specified bit after the atomic operation (makes only sense
>  	 for xor, otherwise the bit content is compile time known),
>  	 we need to invert the bit.  */
> +      tree mask_convert = mask;
> +      gimple *g_convert = nullptr;
> +      if (!use_bool && TREE_TYPE (lhs) != TREE_TYPE (mask))
> +	{
> +	  mask_convert = make_ssa_name (TREE_TYPE (lhs));
> +	  tree var = build1 (NOP_EXPR, TREE_TYPE (lhs), mask);
> +	  g_convert = gimple_build_assign (mask_convert, var);
> +	}
>        g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)),
>  			       BIT_XOR_EXPR, new_lhs,
>  			       use_bool ? build_int_cst (TREE_TYPE (lhs), 1)
> -					: mask);
> +					: mask_convert);
>        new_lhs = gimple_assign_lhs (g);
>        if (throws)
>  	{
> -	  gsi_insert_on_edge_immediate (e, g);
> +	  if (g_convert)
> +	    {
> +	      gsi_insert_on_edge_immediate (e, g_convert);
> +	      gsi = gsi_for_stmt (g_convert);
> +	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	    }
> +	  else
> +	    gsi_insert_on_edge_immediate (e, g);
>  	  gsi = gsi_for_stmt (g);
>  	}
>        else
> -	gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	{
> +	  if (g_convert)
> +	    {
> +	      gsi_insert_after (&gsi, g_convert, GSI_NEW_STMT);
> +	      gsi = gsi_for_stmt (g_convert);
> +	    }
> +	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	}
>      }
>    if (use_bool && has_debug_uses)
>      {
>
  
H.J. Lu Oct. 5, 2021, 4:40 p.m. UTC | #2
On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Mon, 4 Oct 2021, H.J. Lu wrote:
>
> > commit adedd5c173388ae505470df152b9cb3947339566
> > Author: Jakub Jelinek <jakub@redhat.com>
> > Date:   Tue May 3 13:37:25 2016 +0200
> >
> >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> >
> > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > it works only for unsigned integers since atomic builtins operate on the
> > 'uintptr_t' type.  It fails on bool:
> >
> >   _1 = atomic builtin;
> >   _4 = (_Bool) _1;
> >
> > and signed integers:
> >
> >   _1 = atomic builtin;
> >   _2 = (int) _1;
> >   _5 = _2 & (1 << N);
> >
> > Improve bit test on atomic builtin return by converting:
> >
> >   _1 = atomic builtin;
> >   _4 = (_Bool) _1;
> >
> > to
> >
> >   _1 = atomic builtin;
> >   _5 = _1 & (1 << 0);
> >   _4 = (_Bool) _5;
> >
> > and converting:
> >
> >   _1 = atomic builtin;
> >   _2 = (int) _1;
> >   _5 = _2 & (1 << N);
> >
> > to
> >   _1 = atomic builtin;
> >   _6 = _1 & (1 << N);
> >   _5 = (int) _6;
>
> Why not do this last bit with match.pd patterns (and independent on
> whether _1 is defined by an atomic builtin)?  For the first suggested

The full picture is

 _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
  _2 = (int) _1;
  _5 = _2 & mask;

to

  _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
  _6 = _1 & mask;
  _5 = (int) _6;

It is useful only if 2 masks are the same.

> transform that's likely going to be undone by folding, no?
>

The bool case is

  _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
  _4 = (_Bool) _1;

to

  _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
  _5 = _1 & 1;
  _4 = (_Bool) _5;

Without __atomic_fetch_or_*, the conversion isn't needed.
After the conversion, optimize_atomic_bit_test_and will
immediately optimize the code sequence to

  _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
  _4 = (_Bool) _6;

and there is nothing to fold after it.
  
Richard Biener Oct. 8, 2021, 7:16 a.m. UTC | #3
On Tue, 5 Oct 2021, H.J. Lu wrote:

> On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Mon, 4 Oct 2021, H.J. Lu wrote:
> >
> > > commit adedd5c173388ae505470df152b9cb3947339566
> > > Author: Jakub Jelinek <jakub@redhat.com>
> > > Date:   Tue May 3 13:37:25 2016 +0200
> > >
> > >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> > >
> > > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > > it works only for unsigned integers since atomic builtins operate on the
> > > 'uintptr_t' type.  It fails on bool:
> > >
> > >   _1 = atomic builtin;
> > >   _4 = (_Bool) _1;
> > >
> > > and signed integers:
> > >
> > >   _1 = atomic builtin;
> > >   _2 = (int) _1;
> > >   _5 = _2 & (1 << N);
> > >
> > > Improve bit test on atomic builtin return by converting:
> > >
> > >   _1 = atomic builtin;
> > >   _4 = (_Bool) _1;
> > >
> > > to
> > >
> > >   _1 = atomic builtin;
> > >   _5 = _1 & (1 << 0);
> > >   _4 = (_Bool) _5;
> > >
> > > and converting:
> > >
> > >   _1 = atomic builtin;
> > >   _2 = (int) _1;
> > >   _5 = _2 & (1 << N);
> > >
> > > to
> > >   _1 = atomic builtin;
> > >   _6 = _1 & (1 << N);
> > >   _5 = (int) _6;
> >
> > Why not do this last bit with match.pd patterns (and independent on
> > whether _1 is defined by an atomic builtin)?  For the first suggested
> 
> The full picture is
> 
>  _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
>   _2 = (int) _1;
>   _5 = _2 & mask;
> 
> to
> 
>   _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
>   _6 = _1 & mask;
>   _5 = (int) _6;
> 
> It is useful only if 2 masks are the same.
> 
> > transform that's likely going to be undone by folding, no?
> >
> 
> The bool case is
> 
>   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
>   _4 = (_Bool) _1;
> 
> to
> 
>   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
>   _5 = _1 & 1;
>   _4 = (_Bool) _5;
> 
> Without __atomic_fetch_or_*, the conversion isn't needed.
> After the conversion, optimize_atomic_bit_test_and will
> immediately optimize the code sequence to
> 
>   _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
>   _4 = (_Bool) _6;
> 
> and there is nothing to fold after it.

Hmm, I see - so how about instead teaching the code that
produces the .ATOMIC_BIT_TEST_AND_SET the alternate forms instead
of doing the intermediate step separately?

Sorry for the delay btw, I've been busy all week ...

Thanks,
Richard.
  
H.J. Lu Oct. 8, 2021, 2:55 p.m. UTC | #4
On Fri, Oct 8, 2021 at 12:16 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Tue, 5 Oct 2021, H.J. Lu wrote:
>
> > On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Mon, 4 Oct 2021, H.J. Lu wrote:
> > >
> > > > commit adedd5c173388ae505470df152b9cb3947339566
> > > > Author: Jakub Jelinek <jakub@redhat.com>
> > > > Date:   Tue May 3 13:37:25 2016 +0200
> > > >
> > > >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> > > >
> > > > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > > > it works only for unsigned integers since atomic builtins operate on the
> > > > 'uintptr_t' type.  It fails on bool:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _4 = (_Bool) _1;
> > > >
> > > > and signed integers:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _2 = (int) _1;
> > > >   _5 = _2 & (1 << N);
> > > >
> > > > Improve bit test on atomic builtin return by converting:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _4 = (_Bool) _1;
> > > >
> > > > to
> > > >
> > > >   _1 = atomic builtin;
> > > >   _5 = _1 & (1 << 0);
> > > >   _4 = (_Bool) _5;
> > > >
> > > > and converting:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _2 = (int) _1;
> > > >   _5 = _2 & (1 << N);
> > > >
> > > > to
> > > >   _1 = atomic builtin;
> > > >   _6 = _1 & (1 << N);
> > > >   _5 = (int) _6;
> > >
> > > Why not do this last bit with match.pd patterns (and independent on
> > > whether _1 is defined by an atomic builtin)?  For the first suggested
> >
> > The full picture is
> >
> >  _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
> >   _2 = (int) _1;
> >   _5 = _2 & mask;
> >
> > to
> >
> >   _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
> >   _6 = _1 & mask;
> >   _5 = (int) _6;
> >
> > It is useful only if 2 masks are the same.
> >
> > > transform that's likely going to be undone by folding, no?
> > >
> >
> > The bool case is
> >
> >   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
> >   _4 = (_Bool) _1;
> >
> > to
> >
> >   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
> >   _5 = _1 & 1;
> >   _4 = (_Bool) _5;
> >
> > Without __atomic_fetch_or_*, the conversion isn't needed.
> > After the conversion, optimize_atomic_bit_test_and will
> > immediately optimize the code sequence to
> >
> >   _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
> >   _4 = (_Bool) _6;
> >
> > and there is nothing to fold after it.
>
> Hmm, I see - so how about instead teaching the code that
> produces the .ATOMIC_BIT_TEST_AND_SET the alternate forms instead
> of doing the intermediate step separately?
>

The old algorithm is

1.  Check gimple forms.  Return if the form isn't supported.
2.  Do transformation.

My current approach treats the gimple forms accepted by the
old algorithm as canonical forms and changes the algorithm
to

1.  If gimple forms aren't canonical, then
       a. If gimple forms can't be transformed to canonical forms,
           return;
       b. Transform to canonical form.
   endif
2.  Check gimple forms.  Return if the form isn't supported.
3.  Do transformation.

The #2 check is redundant when gimple forms have been
transformed to canonical forms.

I can change my patch to

1.  If gimple forms aren't canonical, then
       a. If gimple forms can't be transformed to canonical forms,
           return;
       b. Transform to canonical form.
    else
      Check gimple forms. Return if the form isn't supported.
    endif
2.  Do transformation.

The advantage of canonical forms is that we don't have to
transform all different forms.

Does it sound OK?

Thanks.
  

Patch

diff --git a/gcc/testsuite/g++.target/i386/pr102566-1.C b/gcc/testsuite/g++.target/i386/pr102566-1.C
new file mode 100644
index 00000000000..6e33298d8bf
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-1.C
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool tbit(std::atomic<int> &i)
+{
+  return i.fetch_or(1, std::memory_order_relaxed) & 1;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
new file mode 100644
index 00000000000..a915de354e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
@@ -0,0 +1,188 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void bar (void);
+
+__attribute__((noinline, noclone)) int
+f1 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f2 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED);
+  int t2 = t1 & mask;
+  return t2 != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f3 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f5 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f6 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__sync_fetch_and_xor (a, mask) & mask) != 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) int
+f9 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f10 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_xor (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f11 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f12 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f13 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f14 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f15 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f16 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f17 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f18 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f19 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f20 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f21 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f22 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f23 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) short int
+f24 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) short int
+f25 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1b.c b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
new file mode 100644
index 00000000000..c4dab8135c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
@@ -0,0 +1,107 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -g" } */
+
+int cnt;
+
+__attribute__((noinline, noclone)) void
+bar (void)
+{
+  cnt++;
+}
+
+#include "pr102566-1a.c"
+
+int a;
+long int b;
+unsigned long int c;
+unsigned short int d;
+
+int
+main ()
+{
+  __atomic_store_n (&a, 15, __ATOMIC_RELAXED);
+  if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15
+      || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31)
+    __builtin_abort ();
+  if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31
+      || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63)
+    __builtin_abort ();
+  __atomic_store_n (&b, 24, __ATOMIC_RELAXED);
+  if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28
+      || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28)
+    __builtin_abort ();
+  __atomic_store_n (&a, 0, __ATOMIC_RELAXED);
+  if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128
+      || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320)
+    __builtin_abort ();
+  if (f6 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321
+      || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (cnt != 0
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 8321, __ATOMIC_RELAXED);
+  if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 128, __ATOMIC_RELAXED);
+  if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144
+      || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144)
+    __builtin_abort ();
+  __atomic_store_n (&c, 1, __ATOMIC_RELAXED);
+  if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128
+      || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  __atomic_store_n (&d, 1, __ATOMIC_RELAXED);
+  if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || cnt != 2)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2.c b/gcc/testsuite/gcc.target/i386/pr102566-2.c
new file mode 100644
index 00000000000..d1c30315353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v)
+{
+  return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
index 70ce6a4d5b8..a3f7b7f233e 100644
--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -3279,10 +3279,115 @@  optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
       || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)
       || !single_imm_use (lhs, &use_p, &use_stmt)
       || !is_gimple_assign (use_stmt)
-      || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR
       || !gimple_vdef (call))
     return;
 
+  mask = gimple_call_arg (call, 1);
+  tree_code rhs_code = gimple_assign_rhs_code (use_stmt);
+  if (rhs_code != BIT_AND_EXPR)
+    {
+      if (rhs_code != NOP_EXPR)
+	return;
+
+      tree nop_lhs = gimple_assign_lhs (use_stmt);
+      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (nop_lhs))
+	return;
+
+      tree nop_rhs = gimple_assign_rhs1 (use_stmt);
+
+      gimple *g;
+      gimple_stmt_iterator gsi;
+      tree var;
+
+      if (TREE_CODE (TREE_TYPE (nop_lhs)) == BOOLEAN_TYPE)
+	{
+	  /* Convert
+	     _1 = atomic bit op;
+	     _4 = (_Bool) _1;
+	     to
+	     _1 = atomic bit op;
+	     _5 = _1 & 1;
+	     _4 = (_Bool) _5;
+	   */
+	  var = make_ssa_name (TREE_TYPE (nop_rhs));
+	  replace_uses_by (nop_rhs, var);
+	  g = gimple_build_assign (var, BIT_AND_EXPR, nop_rhs,
+				   build_int_cst (TREE_TYPE (lhs), 1));
+	  gsi = gsi_for_stmt (use_stmt);
+	  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
+	  use_stmt = g;
+	}
+      else if (TYPE_PRECISION (TREE_TYPE (nop_lhs))
+	       == TYPE_PRECISION (TREE_TYPE (nop_rhs)))
+	{
+	  gimple *use_nop_stmt;
+	  if (!single_imm_use (nop_lhs, &use_p, &use_nop_stmt)
+	      || !is_gimple_assign (use_nop_stmt)
+	      || gimple_assign_rhs_code (use_nop_stmt) != BIT_AND_EXPR)
+	    return;
+
+	  tree op_mask = mask;
+	  if (TREE_CODE (op_mask) == SSA_NAME)
+	    {
+	      g = SSA_NAME_DEF_STMT (op_mask);
+	      if (gimple_assign_rhs_code (g) == NOP_EXPR)
+		{
+		  tree mask_nop_lhs = gimple_assign_lhs (g);
+
+		  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (mask_nop_lhs))
+		    return;
+
+		  tree mask_nop_rhs = gimple_assign_rhs1 (g);
+		  if (TYPE_PRECISION (TREE_TYPE (mask_nop_lhs))
+		      != TYPE_PRECISION (TREE_TYPE (mask_nop_rhs)))
+		    return;
+		  op_mask = mask_nop_rhs;
+		  g = SSA_NAME_DEF_STMT (op_mask);
+		}
+
+	      if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+		{
+		  if (!is_gimple_assign (g)
+		      || gimple_assign_rhs_code (g) != BIT_NOT_EXPR)
+		    return;
+		  tree reset_mask = gimple_assign_rhs1 (g);
+		  if (TREE_CODE (op_mask) != SSA_NAME)
+		    return;
+		  g = SSA_NAME_DEF_STMT (reset_mask);
+		}
+
+	      if (!is_gimple_assign (g)
+		  || gimple_assign_rhs_code (g) != LSHIFT_EXPR
+		  || !integer_onep (gimple_assign_rhs1 (g)))
+		return;
+	    }
+
+	  /* Convert
+	     _1 = atomic bit op;
+	     _2 = (int) _1;
+	     _5 = _2 & N;
+	     to
+	     _1 = atomic bit op;
+	     _6 = _1 & N;
+	     _5 = (int) _6;
+	   */
+	  replace_uses_by (nop_lhs, lhs);
+	  tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
+	  var = make_ssa_name (TREE_TYPE (use_nop_lhs));
+	  gimple_assign_set_lhs (use_nop_stmt, var);
+	  gsi = gsi_for_stmt (use_stmt);
+	  gsi_remove (&gsi, true);
+	  release_defs (use_stmt);
+	  gsi_remove (gsip, true);
+	  var = build1 (NOP_EXPR, TREE_TYPE (use_nop_lhs), var);
+	  gsi = gsi_for_stmt (use_nop_stmt);
+	  g = gimple_build_assign (use_nop_lhs, var);
+	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	  use_stmt = use_nop_stmt;
+	  mask = op_mask;
+	}
+    }
+
   switch (fn)
     {
     case IFN_ATOMIC_BIT_TEST_AND_SET:
@@ -3301,7 +3406,6 @@  optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing)
     return;
 
-  mask = gimple_call_arg (call, 1);
   tree use_lhs = gimple_assign_lhs (use_stmt);
   if (!use_lhs)
     return;
@@ -3434,18 +3538,40 @@  optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
 	 of the specified bit after the atomic operation (makes only sense
 	 for xor, otherwise the bit content is compile time known),
 	 we need to invert the bit.  */
+      tree mask_convert = mask;
+      gimple *g_convert = nullptr;
+      if (!use_bool && TREE_TYPE (lhs) != TREE_TYPE (mask))
+	{
+	  mask_convert = make_ssa_name (TREE_TYPE (lhs));
+	  tree var = build1 (NOP_EXPR, TREE_TYPE (lhs), mask);
+	  g_convert = gimple_build_assign (mask_convert, var);
+	}
       g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)),
 			       BIT_XOR_EXPR, new_lhs,
 			       use_bool ? build_int_cst (TREE_TYPE (lhs), 1)
-					: mask);
+					: mask_convert);
       new_lhs = gimple_assign_lhs (g);
       if (throws)
 	{
-	  gsi_insert_on_edge_immediate (e, g);
+	  if (g_convert)
+	    {
+	      gsi_insert_on_edge_immediate (e, g_convert);
+	      gsi = gsi_for_stmt (g_convert);
+	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	    }
+	  else
+	    gsi_insert_on_edge_immediate (e, g);
 	  gsi = gsi_for_stmt (g);
 	}
       else
-	gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	{
+	  if (g_convert)
+	    {
+	      gsi_insert_after (&gsi, g_convert, GSI_NEW_STMT);
+	      gsi = gsi_for_stmt (g_convert);
+	    }
+	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	}
     }
   if (use_bool && has_debug_uses)
     {