i386: Relax cmpxchg instruction under -mrelax-cmpxchg-loop [PR 103069]

Message ID 20220221092908.14165-1-hongyu.wang@intel.com
State New
Headers
Series i386: Relax cmpxchg instruction under -mrelax-cmpxchg-loop [PR 103069] |

Commit Message

Hongyu Wang Feb. 21, 2022, 9:29 a.m. UTC
  Hi,

For cmpxchg, it is commonly used in spin loop, and several user code
such as pthread directly takes cmpxchg as loop condition, which cause
huge cache bouncing.

This patch extends previous implementation to relax all cmpxchg
instruction under -mrelax-cmpxchg-loop with an extra atomic load,
compare and emulate the failed cmpxchg behavior.

For original spin loop which looks like

loop: mov    %eax,%r8d
      or     $1,%r8d
      lock cmpxchg %r8d,(%rdi)
      jne    loop

It will now truns to

loop: mov    %eax,%r8d
      or     $1,%r8d
      mov    (%r8),%rsi <--- load lock first
      cmp    %rsi,%rax <--- compare with expected input
      jne    .L2 <--- lock ne expected
      lock cmpxchg %r8d,(%rdi)
      jne    loop
  L2: mov    %rsi,%rax <--- perform the behavior of failed cmpxchg
      jne    loop

under -mrelax-cmpxchg-loop.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}

OK for master?

gcc/ChangeLog:

	PR target/103069
	* config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop):
	Split atomic fetch and loop part.
	(ix86_expand_cmpxchg_loop): New expander for cmpxchg loop.
	* config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New
	prototype.
	* config/i386/sync.md (atomic_compare_and_swap<mode>): Call new
	expander under TARGET_RELAX_CMPXCHG_LOOP.
	(atomic_compare_and_swap<mode>): Likewise for doubleword modes.

gcc/testsuite/ChangeLog:

	PR target/103069
	* gcc.target/i386/pr103069-2.c: Adjust result check.
	* gcc.target/i386/pr103069-3.c: New test.
	* gcc.target/i386/pr103069-4.c: Likewise.
---
 gcc/config/i386/i386-expand.cc             | 153 +++++++++++++++------
 gcc/config/i386/i386-protos.h              |   2 +
 gcc/config/i386/sync.md                    |  65 +++++----
 gcc/testsuite/gcc.target/i386/pr103069-2.c |   4 +-
 gcc/testsuite/gcc.target/i386/pr103069-3.c |  24 ++++
 gcc/testsuite/gcc.target/i386/pr103069-4.c |  43 ++++++
 6 files changed, 226 insertions(+), 65 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-4.c
  

Comments

Uros Bizjak Feb. 21, 2022, 4:41 p.m. UTC | #1
On Mon, Feb 21, 2022 at 10:29 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi,
>
> For cmpxchg, it is commonly used in spin loop, and several user code
> such as pthread directly takes cmpxchg as loop condition, which cause
> huge cache bouncing.
>
> This patch extends previous implementation to relax all cmpxchg
> instruction under -mrelax-cmpxchg-loop with an extra atomic load,
> compare and emulate the failed cmpxchg behavior.
>
> For original spin loop which looks like
>
> loop: mov    %eax,%r8d
>       or     $1,%r8d
>       lock cmpxchg %r8d,(%rdi)
>       jne    loop
>
> It will now truns to
>
> loop: mov    %eax,%r8d
>       or     $1,%r8d
>       mov    (%r8),%rsi <--- load lock first
>       cmp    %rsi,%rax <--- compare with expected input
>       jne    .L2 <--- lock ne expected
>       lock cmpxchg %r8d,(%rdi)
>       jne    loop
>   L2: mov    %rsi,%rax <--- perform the behavior of failed cmpxchg
>       jne    loop
>
> under -mrelax-cmpxchg-loop.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
>
> OK for master?
>
> gcc/ChangeLog:
>
>         PR target/103069
>         * config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop):
>         Split atomic fetch and loop part.
>         (ix86_expand_cmpxchg_loop): New expander for cmpxchg loop.
>         * config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New
>         prototype.
>         * config/i386/sync.md (atomic_compare_and_swap<mode>): Call new
>         expander under TARGET_RELAX_CMPXCHG_LOOP.
>         (atomic_compare_and_swap<mode>): Likewise for doubleword modes.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/103069
>         * gcc.target/i386/pr103069-2.c: Adjust result check.
>         * gcc.target/i386/pr103069-3.c: New test.
>         * gcc.target/i386/pr103069-4.c: Likewise.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc             | 153 +++++++++++++++------
>  gcc/config/i386/i386-protos.h              |   2 +
>  gcc/config/i386/sync.md                    |  65 +++++----
>  gcc/testsuite/gcc.target/i386/pr103069-2.c |   4 +-
>  gcc/testsuite/gcc.target/i386/pr103069-3.c |  24 ++++
>  gcc/testsuite/gcc.target/i386/pr103069-4.c |  43 ++++++
>  6 files changed, 226 insertions(+), 65 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-4.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index ce9607e36de..6cf1a0b9cb6 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
>                                        enum rtx_code code, bool after,
>                                        bool doubleword)
>  {
> -  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
> -  rtx_code_label *loop_label, *pause_label, *done_label;
> +  rtx old_reg, new_reg, old_mem, success;
>    machine_mode mode = GET_MODE (target);
> +  rtx_code_label *loop_label = NULL;
>
>    old_reg = gen_reg_rtx (mode);
>    new_reg = old_reg;
> -  loop_label = gen_label_rtx ();
> -  pause_label = gen_label_rtx ();
> -  done_label = gen_label_rtx ();
>    old_mem = copy_to_reg (mem);
> +  loop_label = gen_label_rtx ();
>    emit_label (loop_label);
>    emit_move_insn (old_reg, old_mem);
>
> @@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
>    if (after)
>      emit_move_insn (target, new_reg);
>
> -  /* Load memory again inside loop.  */
> -  new_mem = copy_to_reg (mem);
> -  /* Compare mem value with expected value.  */
> +  success = NULL_RTX;
> +
> +  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
> +                           gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
> +                                         SImode),
> +                           doubleword, loop_label);
> +}
> +
> +/* Relax cmpxchg instruction, param loop_label indicates whether
> +   the instruction should be relaxed with a pause loop.  If not,
> +   it will be relaxed to an atomic load + compare, and skip
> +   cmpxchg instruction if mem != exp_input.  */
> +
> +void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
> +                              rtx mem, rtx exp_input, rtx new_input,
> +                              rtx mem_model, bool doubleword,
> +                              rtx_code_label *loop_label)
> +{
> +  rtx_code_label *cmp_label = NULL;
> +  rtx_code_label *done_label = NULL;
> +  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
> +  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
> +  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
> +  machine_mode mode = GET_MODE (target_val), hmode = mode;
> +
> +  if (*ptarget_bool == NULL)
> +    target_bool = gen_reg_rtx (QImode);
> +  else
> +    target_bool = *ptarget_bool;
> +
> +  cmp_label = gen_label_rtx ();
> +  done_label = gen_label_rtx ();
> +
> +  new_mem = gen_reg_rtx (mode);
> +  /* Load memory first.  */
> +  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
> +
> +  switch (mode)
> +    {
> +    case TImode:
> +      gendw = gen_atomic_compare_and_swapti_doubleword;
> +      hmode = DImode;
> +      break;
> +    case DImode:
> +      if (doubleword)
> +       {
> +         gendw = gen_atomic_compare_and_swapdi_doubleword;
> +         hmode = SImode;
> +       }
> +      else
> +       gen = gen_atomic_compare_and_swapdi_1;
> +      break;
> +    case SImode:
> +      gen = gen_atomic_compare_and_swapsi_1; break;
> +    case HImode:
> +      gen = gen_atomic_compare_and_swaphi_1; break;
> +    case QImode:
> +      gen = gen_atomic_compare_and_swapqi_1; break;
> +    default:
> +      gcc_unreachable ();
> +    }
>
> +  /* Compare mem value with expected value.  */
>    if (doubleword)
>      {
> -      machine_mode half_mode = (mode == DImode)? SImode : DImode;
> -      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
> -      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
> -      rtx high_new_mem = gen_highpart (half_mode, new_mem);
> -      rtx high_old_mem = gen_highpart (half_mode, old_mem);
> -      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
> -                              half_mode, 1, pause_label,
> +      rtx low_new_mem = gen_lowpart (hmode, new_mem);
> +      rtx low_exp_input = gen_lowpart (hmode, exp_input);
> +      rtx high_new_mem = gen_highpart (hmode, new_mem);
> +      rtx high_exp_input = gen_highpart (hmode, exp_input);
> +      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
> +                              hmode, 1, cmp_label,
>                                profile_probability::guessed_never ());
> -      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
> -                              half_mode, 1, pause_label,
> +      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
> +                              hmode, 1, cmp_label,
>                                profile_probability::guessed_never ());
>      }
>    else
> -    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
> -                            GET_MODE (old_mem), 1, pause_label,
> +    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
> +                            GET_MODE (exp_input), 1, cmp_label,
>                              profile_probability::guessed_never ());
>
> -  success = NULL_RTX;
> -  oldval = old_mem;
> -  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
> -                                 new_reg, false, MEMMODEL_SYNC_SEQ_CST,
> -                                 MEMMODEL_RELAXED);
> -  if (oldval != old_mem)
> -    emit_move_insn (old_mem, oldval);
> -
> -  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
> -                          GET_MODE (success), 1, loop_label,
> -                          profile_probability::guessed_never ());
> -
> -  emit_jump_insn (gen_jump (done_label));
> -  emit_barrier ();
> -
> -  /* If mem is not expected, pause and loop back.  */
> -  emit_label (pause_label);
> -  emit_insn (gen_pause ());
> -  emit_jump_insn (gen_jump (loop_label));
> -  emit_barrier ();
> -  emit_label (done_label);
> +  /* Directly emits cmpxchg here.  */
> +  if (doubleword)
> +    emit_insn (gendw (target_val, mem, exp_input,
> +                     gen_lowpart (hmode, new_input),
> +                     gen_highpart (hmode, new_input),
> +                     mem_model));
> +  else
> +    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
> +
> +  if (!loop_label)
> +  {
> +    emit_jump_insn (gen_jump (done_label));
> +    emit_barrier ();
> +    emit_label (cmp_label);
> +    emit_move_insn (target_val, new_mem);
> +    emit_label (done_label);
> +    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +  }
> +  else
> +  {
> +    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
> +                            GET_MODE (target_bool), 1, loop_label,
> +                            profile_probability::guessed_never ());
> +    emit_jump_insn (gen_jump (done_label));
> +    emit_barrier ();
> +
> +    /* If mem is not expected, pause and loop back.  */
> +    emit_label (cmp_label);
> +    emit_insn (gen_pause ());
> +    emit_jump_insn (gen_jump (loop_label));
> +    emit_barrier ();
> +    emit_label (done_label);
> +  }
> +
> +  *ptarget_bool = target_bool;
>  }
>
>  #include "gt-i386-expand.h"
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index b7e9aa75d25..d5e11259a5a 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -221,6 +221,8 @@ extern void ix86_split_mmx_punpck (rtx[], bool);
>  extern void ix86_expand_avx_vzeroupper (void);
>  extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
>                                               bool, bool);
> +extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
> +                                     bool, rtx_code_label *);
>
>  #ifdef TREE_CODE
>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
> diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
> index 36417c54c11..820e9ca911a 100644
> --- a/gcc/config/i386/sync.md
> +++ b/gcc/config/i386/sync.md
> @@ -373,11 +373,20 @@ (define_expand "atomic_compare_and_swap<mode>"
>     (match_operand:SI 7 "const_int_operand")]   ;; failure model
>    "TARGET_CMPXCHG"
>  {
> -  emit_insn
> -   (gen_atomic_compare_and_swap<mode>_1
> -    (operands[1], operands[2], operands[3], operands[4], operands[6]));
> -  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> -                    const0_rtx);
> +  if (TARGET_RELAX_CMPXCHG_LOOP)
> +  {
> +    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
> +                             operands[3], operands[4], operands[6],
> +                             false, NULL);
> +  }
> +  else
> +  {
> +    emit_insn
> +      (gen_atomic_compare_and_swap<mode>_1
> +       (operands[1], operands[2], operands[3], operands[4], operands[6]));
> +      ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                       const0_rtx);
> +  }
>    DONE;
>  })
>
> @@ -397,25 +406,35 @@ (define_expand "atomic_compare_and_swap<mode>"
>     (match_operand:SI 7 "const_int_operand")]   ;; failure model
>    "TARGET_CMPXCHG"
>  {
> -  if (<MODE>mode == DImode && TARGET_64BIT)
> -    {
> -      emit_insn
> -       (gen_atomic_compare_and_swapdi_1
> -       (operands[1], operands[2], operands[3], operands[4], operands[6]));
> -    }
> +  int doubleword = !(<MODE>mode == DImode && TARGET_64BIT);
> +  if (TARGET_RELAX_CMPXCHG_LOOP)
> +  {
> +    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
> +                             operands[3], operands[4], operands[6],
> +                             doubleword, NULL);
> +  }
>    else
> -    {
> -      machine_mode hmode = <CASHMODE>mode;
> -
> -      emit_insn
> -       (gen_atomic_compare_and_swap<mode>_doubleword
> -        (operands[1], operands[2], operands[3],
> -        gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]),
> -        operands[6]));
> -    }
> -
> -  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> -                    const0_rtx);
> +  {
> +    if (!doubleword)
> +      {
> +       emit_insn
> +         (gen_atomic_compare_and_swapdi_1
> +          (operands[1], operands[2], operands[3], operands[4], operands[6]));
> +      }
> +    else
> +      {
> +       machine_mode hmode = <CASHMODE>mode;
> +
> +       emit_insn
> +         (gen_atomic_compare_and_swap<mode>_doubleword
> +          (operands[1], operands[2], operands[3],
> +           gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]),
> +           operands[6]));
> +      }
> +
> +    ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +  }
>    DONE;
>  })
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c b/gcc/testsuite/gcc.target/i386/pr103069-2.c
> index b3f2235fd55..2ceb54e013a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr103069-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
> @@ -40,12 +40,12 @@ FUNC_ATOMIC_RELAX (char, xor)
>    TYPE c = 11, d = 101;        \
>    res = relax_##TYPE##_##OP##_fetch (&a, b); \
>    exp = f_##TYPE##_##OP##_fetch (&c, d);  \
> -  if (res != exp) \
> +  if (res != exp || a != c) \
>      abort (); \
>    a = c = 21, b = d = 92; \
>    res = relax_##TYPE##_fetch_##OP (&a, b); \
>    exp = f_##TYPE##_fetch_##OP (&c, d);  \
> -  if (res != exp) \
> +  if (res != exp || a != c) \
>      abort (); \
>  }
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-3.c b/gcc/testsuite/gcc.target/i386/pr103069-3.c
> new file mode 100644
> index 00000000000..fee708905ec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-3.c
> @@ -0,0 +1,24 @@
> +/* PR target/103068 */
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic -mrelax-cmpxchg-loop" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC_CMPXCHG(TYPE) \
> +__attribute__ ((noinline, noclone))    \
> +TYPE f_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
> +{ \
> +  do  \
> +  { \
> +    newval = oldval | 1;  \
> +  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
> +                                         0, __ATOMIC_RELEASE,  \
> +                                         __ATOMIC_RELAXED));  \
> +  return *lock;        \
> +}
> +
> +
> +FUNC_CMPXCHG (int64_t)
> +FUNC_CMPXCHG (int)
> +FUNC_CMPXCHG (short)
> +FUNC_CMPXCHG (char)
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-4.c b/gcc/testsuite/gcc.target/i386/pr103069-4.c
> new file mode 100644
> index 00000000000..5abcbca2a98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-4.c
> @@ -0,0 +1,43 @@
> +/* PR target/103069 */
> +/* { dg-do run } */
> +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */
> +
> +#include <stdlib.h>
> +#include "pr103069-3.c"
> +
> +#define FUNC_CMPXCHG_RELAX(TYPE) \
> +__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))     \
> +TYPE relax_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
> +{ \
> +  do  \
> +  { \
> +    newval = oldval | 1;  \
> +  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
> +                                         0, __ATOMIC_RELEASE,  \
> +                                         __ATOMIC_RELAXED));  \
> +  return *lock;        \
> +}
> +
> +FUNC_CMPXCHG_RELAX (int64_t)
> +FUNC_CMPXCHG_RELAX (int)
> +FUNC_CMPXCHG_RELAX (short)
> +FUNC_CMPXCHG_RELAX (char)
> +
> +#define TEST_CMPXCHG_LOOP(TYPE)        \
> +{ \
> +  TYPE a = 11, b = 20, c = 11, res, exp; \
> +  TYPE d = 11, e = 20, f = 11; \
> +  res = relax_##TYPE##_cmpxchg (&a, b, c); \
> +  exp = f_##TYPE##_cmpxchg (&d, e, f); \
> +  if (res != exp || a != d) \
> +    abort (); \
> +}
> +
> +int main (void)
> +{
> +  TEST_CMPXCHG_LOOP (int64_t)
> +  TEST_CMPXCHG_LOOP (int)
> +  TEST_CMPXCHG_LOOP (short)
> +  TEST_CMPXCHG_LOOP (char)
> +  return 0;
> +}
> --
> 2.18.1
>
  

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ce9607e36de..6cf1a0b9cb6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23203,16 +23203,14 @@  void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
 				       enum rtx_code code, bool after,
 				       bool doubleword)
 {
-  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
-  rtx_code_label *loop_label, *pause_label, *done_label;
+  rtx old_reg, new_reg, old_mem, success;
   machine_mode mode = GET_MODE (target);
+  rtx_code_label *loop_label = NULL;
 
   old_reg = gen_reg_rtx (mode);
   new_reg = old_reg;
-  loop_label = gen_label_rtx ();
-  pause_label = gen_label_rtx ();
-  done_label = gen_label_rtx ();
   old_mem = copy_to_reg (mem);
+  loop_label = gen_label_rtx ();
   emit_label (loop_label);
   emit_move_insn (old_reg, old_mem);
 
@@ -23234,50 +23232,125 @@  void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
   if (after)
     emit_move_insn (target, new_reg);
 
-  /* Load memory again inside loop.  */
-  new_mem = copy_to_reg (mem);
-  /* Compare mem value with expected value.  */
+  success = NULL_RTX;
+
+  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
+			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
+					  SImode),
+			    doubleword, loop_label);
+}
+
+/* Relax cmpxchg instruction, param loop_label indicates whether
+   the instruction should be relaxed with a pause loop.  If not,
+   it will be relaxed to an atomic load + compare, and skip
+   cmpxchg instruction if mem != exp_input.  */
+
+void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
+			       rtx mem, rtx exp_input, rtx new_input,
+			       rtx mem_model, bool doubleword,
+			       rtx_code_label *loop_label)
+{
+  rtx_code_label *cmp_label = NULL;
+  rtx_code_label *done_label = NULL;
+  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
+  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
+  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
+  machine_mode mode = GET_MODE (target_val), hmode = mode;
+
+  if (*ptarget_bool == NULL)
+    target_bool = gen_reg_rtx (QImode);
+  else
+    target_bool = *ptarget_bool;
+
+  cmp_label = gen_label_rtx ();
+  done_label = gen_label_rtx ();
+
+  new_mem = gen_reg_rtx (mode);
+  /* Load memory first.  */
+  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
+
+  switch (mode)
+    {
+    case TImode:
+      gendw = gen_atomic_compare_and_swapti_doubleword;
+      hmode = DImode;
+      break;
+    case DImode:
+      if (doubleword)
+	{
+	  gendw = gen_atomic_compare_and_swapdi_doubleword;
+	  hmode = SImode;
+	}
+      else
+	gen = gen_atomic_compare_and_swapdi_1;
+      break;
+    case SImode:
+      gen = gen_atomic_compare_and_swapsi_1; break;
+    case HImode:
+      gen = gen_atomic_compare_and_swaphi_1; break;
+    case QImode:
+      gen = gen_atomic_compare_and_swapqi_1; break;
+    default:
+      gcc_unreachable ();
+    }
 
+  /* Compare mem value with expected value.  */
   if (doubleword)
     {
-      machine_mode half_mode = (mode == DImode)? SImode : DImode;
-      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
-      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
-      rtx high_new_mem = gen_highpart (half_mode, new_mem);
-      rtx high_old_mem = gen_highpart (half_mode, old_mem);
-      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
-			       half_mode, 1, pause_label,
+      rtx low_new_mem = gen_lowpart (hmode, new_mem);
+      rtx low_exp_input = gen_lowpart (hmode, exp_input);
+      rtx high_new_mem = gen_highpart (hmode, new_mem);
+      rtx high_exp_input = gen_highpart (hmode, exp_input);
+      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
+			       hmode, 1, cmp_label,
 			       profile_probability::guessed_never ());
-      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
-			       half_mode, 1, pause_label,
+      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
+			       hmode, 1, cmp_label,
 			       profile_probability::guessed_never ());
     }
   else
-    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
-			     GET_MODE (old_mem), 1, pause_label,
+    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
+			     GET_MODE (exp_input), 1, cmp_label,
 			     profile_probability::guessed_never ());
 
-  success = NULL_RTX;
-  oldval = old_mem;
-  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
-				  new_reg, false, MEMMODEL_SYNC_SEQ_CST,
-				  MEMMODEL_RELAXED);
-  if (oldval != old_mem)
-    emit_move_insn (old_mem, oldval);
-
-  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
-			   GET_MODE (success), 1, loop_label,
-			   profile_probability::guessed_never ());
-
-  emit_jump_insn (gen_jump (done_label));
-  emit_barrier ();
-
-  /* If mem is not expected, pause and loop back.  */
-  emit_label (pause_label);
-  emit_insn (gen_pause ());
-  emit_jump_insn (gen_jump (loop_label));
-  emit_barrier ();
-  emit_label (done_label);
+  /* Directly emits cmpxchg here.  */
+  if (doubleword)
+    emit_insn (gendw (target_val, mem, exp_input,
+		      gen_lowpart (hmode, new_input),
+		      gen_highpart (hmode, new_input),
+		      mem_model));
+  else
+    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
+
+  if (!loop_label)
+  {
+    emit_jump_insn (gen_jump (done_label));
+    emit_barrier ();
+    emit_label (cmp_label);
+    emit_move_insn (target_val, new_mem);
+    emit_label (done_label);
+    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+		       const0_rtx);
+  }
+  else
+  {
+    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+		       const0_rtx);
+    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
+			     GET_MODE (target_bool), 1, loop_label,
+			     profile_probability::guessed_never ());
+    emit_jump_insn (gen_jump (done_label));
+    emit_barrier ();
+
+    /* If mem is not expected, pause and loop back.  */
+    emit_label (cmp_label);
+    emit_insn (gen_pause ());
+    emit_jump_insn (gen_jump (loop_label));
+    emit_barrier ();
+    emit_label (done_label);
+  }
+
+  *ptarget_bool = target_bool;
 }
 
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index b7e9aa75d25..d5e11259a5a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -221,6 +221,8 @@  extern void ix86_split_mmx_punpck (rtx[], bool);
 extern void ix86_expand_avx_vzeroupper (void);
 extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
 					      bool, bool);
+extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
+				      bool, rtx_code_label *);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 36417c54c11..820e9ca911a 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -373,11 +373,20 @@  (define_expand "atomic_compare_and_swap<mode>"
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn
-   (gen_atomic_compare_and_swap<mode>_1
-    (operands[1], operands[2], operands[3], operands[4], operands[6]));
-  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
-		     const0_rtx);
+  if (TARGET_RELAX_CMPXCHG_LOOP)
+  {
+    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
+			      operands[3], operands[4], operands[6],
+			      false, NULL);
+  }
+  else
+  {
+    emit_insn
+      (gen_atomic_compare_and_swap<mode>_1
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
+      ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+			const0_rtx);
+  }
   DONE;
 })
 
@@ -397,25 +406,35 @@  (define_expand "atomic_compare_and_swap<mode>"
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  if (<MODE>mode == DImode && TARGET_64BIT)
-    {
-      emit_insn
-       (gen_atomic_compare_and_swapdi_1
-	(operands[1], operands[2], operands[3], operands[4], operands[6]));
-    }
+  int doubleword = !(<MODE>mode == DImode && TARGET_64BIT);
+  if (TARGET_RELAX_CMPXCHG_LOOP)
+  {
+    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
+			      operands[3], operands[4], operands[6],
+			      doubleword, NULL);
+  }
   else
-    {
-      machine_mode hmode = <CASHMODE>mode;
-
-      emit_insn
-       (gen_atomic_compare_and_swap<mode>_doubleword
-        (operands[1], operands[2], operands[3],
-	 gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]),
-	 operands[6]));
-    }
-
-  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
-		     const0_rtx);
+  {
+    if (!doubleword)
+      {
+	emit_insn
+	  (gen_atomic_compare_and_swapdi_1
+	   (operands[1], operands[2], operands[3], operands[4], operands[6]));
+      }
+    else
+      {
+	machine_mode hmode = <CASHMODE>mode;
+
+	emit_insn
+	  (gen_atomic_compare_and_swap<mode>_doubleword
+	   (operands[1], operands[2], operands[3],
+	    gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]),
+	    operands[6]));
+      }
+
+    ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+		       const0_rtx);
+  }
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c b/gcc/testsuite/gcc.target/i386/pr103069-2.c
index b3f2235fd55..2ceb54e013a 100644
--- a/gcc/testsuite/gcc.target/i386/pr103069-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
@@ -40,12 +40,12 @@  FUNC_ATOMIC_RELAX (char, xor)
   TYPE c = 11, d = 101;	\
   res = relax_##TYPE##_##OP##_fetch (&a, b); \
   exp = f_##TYPE##_##OP##_fetch (&c, d);  \
-  if (res != exp) \
+  if (res != exp || a != c) \
     abort (); \
   a = c = 21, b = d = 92; \
   res = relax_##TYPE##_fetch_##OP (&a, b); \
   exp = f_##TYPE##_fetch_##OP (&c, d);  \
-  if (res != exp) \
+  if (res != exp || a != c) \
     abort (); \
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-3.c b/gcc/testsuite/gcc.target/i386/pr103069-3.c
new file mode 100644
index 00000000000..fee708905ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-3.c
@@ -0,0 +1,24 @@ 
+/* PR target/103068 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic -mrelax-cmpxchg-loop" } */ 
+
+#include <stdint.h>
+
+#define FUNC_CMPXCHG(TYPE) \
+__attribute__ ((noinline, noclone))	\
+TYPE f_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
+{ \
+  do  \
+  { \
+    newval = oldval | 1;  \
+  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
+					  0, __ATOMIC_RELEASE,  \
+					  __ATOMIC_RELAXED));  \
+  return *lock;	\
+}
+
+
+FUNC_CMPXCHG (int64_t)
+FUNC_CMPXCHG (int)
+FUNC_CMPXCHG (short)
+FUNC_CMPXCHG (char)
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-4.c b/gcc/testsuite/gcc.target/i386/pr103069-4.c
new file mode 100644
index 00000000000..5abcbca2a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-4.c
@@ -0,0 +1,43 @@ 
+/* PR target/103069 */
+/* { dg-do run } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */ 
+
+#include <stdlib.h>
+#include "pr103069-3.c"
+
+#define FUNC_CMPXCHG_RELAX(TYPE) \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))	\
+TYPE relax_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
+{ \
+  do  \
+  { \
+    newval = oldval | 1;  \
+  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
+					  0, __ATOMIC_RELEASE,  \
+					  __ATOMIC_RELAXED));  \
+  return *lock;	\
+}
+
+FUNC_CMPXCHG_RELAX (int64_t)
+FUNC_CMPXCHG_RELAX (int)
+FUNC_CMPXCHG_RELAX (short)
+FUNC_CMPXCHG_RELAX (char)
+
+#define TEST_CMPXCHG_LOOP(TYPE)	\
+{ \
+  TYPE a = 11, b = 20, c = 11, res, exp; \
+  TYPE d = 11, e = 20, f = 11;	\
+  res = relax_##TYPE##_cmpxchg (&a, b, c); \
+  exp = f_##TYPE##_cmpxchg (&d, e, f); \
+  if (res != exp || a != d) \
+    abort (); \
+}
+
+int main (void)
+{
+  TEST_CMPXCHG_LOOP (int64_t)
+  TEST_CMPXCHG_LOOP (int)
+  TEST_CMPXCHG_LOOP (short)
+  TEST_CMPXCHG_LOOP (char)
+  return 0;
+}