diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 088e6af2258a56bb4eeb1b0402624a48d080d88a..3e4de64ec24365b8c7c6be2cb6e1fecd0c2cf638 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -23138,4 +23138,116 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   *rem_p = rem;
 }
 
+/* Expand an atomic fetch-and-op or op-and-fetch on MEM as a relaxed
+   compare-and-swap loop.  TARGET receives the value of MEM before the
+   operation if AFTER is false and the newly computed value if AFTER is
+   true.  VAL is the second operand.  CODE is the rtx code of the
+   operation; NOT stands for nand and is expanded as ~(old & VAL).
+   DOUBLEWORD makes the early comparison work on the two halves of the
+   value separately.
+
+   The emitted sequence is
+
+       old_mem = *MEM;
+     loop:
+       old_reg = old_mem;
+       new_reg = old_reg OP VAL;
+       new_mem = *MEM;
+       if (new_mem != old_mem) goto pause;
+       success = cmpxchg (MEM, &old_mem, new_reg);
+       if (!success) goto loop;
+       goto done;
+     pause:
+       old_mem = new_mem;
+       pause;
+       goto loop;
+     done:  */
+void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+				       enum rtx_code code, bool after,
+				       bool doubleword)
+{
+  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
+  rtx_code_label *loop_label, *pause_label, *done_label;
+  machine_mode mode = GET_MODE (target);
+
+  old_reg = gen_reg_rtx (mode);
+  new_reg = old_reg;
+  loop_label = gen_label_rtx ();
+  pause_label = gen_label_rtx ();
+  done_label = gen_label_rtx ();
+  old_mem = copy_to_reg (mem);
+  emit_label (loop_label);
+  emit_move_insn (old_reg, old_mem);
+
+  /* Return value for atomic_fetch_op.  */
+  if (!after)
+    emit_move_insn (target, old_reg);
+
+  if (code == NOT)
+    {
+      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
+				     true, OPTAB_LIB_WIDEN);
+      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
+    }
+  else
+    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
+				   true, OPTAB_LIB_WIDEN);
+
+  /* Return value for atomic_op_fetch.  */
+  if (after)
+    emit_move_insn (target, new_reg);
+
+  /* Load memory again inside the loop and compare it with the expected
+     value, so that the compare-and-swap is only attempted when the
+     expected value is still current.  */
+  new_mem = copy_to_reg (mem);
+
+  if (doubleword)
+    {
+      machine_mode half_mode = (mode == DImode)? SImode : DImode;
+      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
+      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
+      rtx high_new_mem = gen_highpart (half_mode, new_mem);
+      rtx high_old_mem = gen_highpart (half_mode, old_mem);
+      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
+			       half_mode, 1, pause_label,
+			       profile_probability::guessed_never ());
+      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
+			       half_mode, 1, pause_label,
+			       profile_probability::guessed_never ());
+    }
+  else
+    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
+			     GET_MODE (old_mem), 1, pause_label,
+			     profile_probability::guessed_never ());
+
+  success = NULL_RTX;
+  oldval = old_mem;
+  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
+				  new_reg, false, MEMMODEL_SYNC_SEQ_CST,
+				  MEMMODEL_RELAXED);
+  if (oldval != old_mem)
+    emit_move_insn (old_mem, oldval);
+
+  /* The compare-and-swap failed: retry with the refreshed old_mem.  */
+  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
+			   GET_MODE (success), 1, loop_label,
+			   profile_probability::guessed_never ());
+
+  /* The compare-and-swap succeeded: jump over the pause block, which
+     would otherwise be fallen into and send control back to the loop.  */
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
+  /* If mem is not expected, pause and loop back.  Make the freshly loaded
+     value the expected one first: old_mem is otherwise only refreshed by
+     the compare-and-swap, so the early comparison would keep failing.  */
+  emit_label (pause_label);
+  emit_move_insn (old_mem, new_mem);
+  emit_insn (gen_pause ());
+  emit_jump_insn (gen_jump (loop_label));
+  emit_barrier ();
+  emit_label (done_label);
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index a8cc0664f11cb2f20eb967f7409e0620e638a93f..feff2584f41a2c5b7e501d3e8415b8d29708e056 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
     { "-mstv",				MASK_STV },
     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD },
     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE },
-    { "-mcall-ms2sysv-xlogues",		MASK_CALL_MS2SYSV_XLOGUES }
+    { "-mcall-ms2sysv-xlogues",		MASK_CALL_MS2SYSV_XLOGUES },
+    { "-mrelax-cmpxchg-loop",		MASK_RELAX_CMPXCHG_LOOP }
   };
 
   /* Additional flag options.  */
@@ -1092,6 +1093,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_IX86_YES ("general-regs-only",
 			OPT_mgeneral_regs_only,
 			OPTION_MASK_GENERAL_REGS_ONLY),
+
+    IX86_ATTR_YES ("relax-cmpxchg-loop",
+		   OPT_mrelax_cmpxchg_loop,
+		   MASK_RELAX_CMPXCHG_LOOP),
   };
 
   location_t loc
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bd52450a148eb84c67c49a409409a9b550856819..7e05510c679d054b08c5ddac6c85d532940b7ec2 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -217,6 +217,8 @@ extern void ix86_move_vector_high_sse_to_mmx (rtx);
 extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
 extern void ix86_split_mmx_punpck (rtx[], bool);
 extern void ix86_expand_avx_vzeroupper (void);
+extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
+					      bool, bool);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index ad366974b5b867c437b5c39983bcf5bab2b30013..46fad3cc03808b170514585ea17e152c3b3149d1 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -404,6 +404,10 @@ momit-leaf-frame-pointer
 Target Mask(OMIT_LEAF_FRAME_POINTER) Save
 Omit the frame pointer in leaf functions.
 
+mrelax-cmpxchg-loop
+Target Mask(RELAX_CMPXCHG_LOOP) Save
+Relax cmpxchg loop for atomic_fetch_{or,xor,and,nand} by adding load and cmp before cmpxchg, execute pause and loop back to load and compare if load value is not expected.
+
 mpc32
 Target RejectNegative
 Set 80387 floating-point precision to 32-bit.
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 9716a0b2f2c1b95ca353a5a212fe07322e20402c..cc4fe727bd908379044d286a0bd425cdd1c5fbec 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -525,6 +525,123 @@
 	      (set (reg:CCZ FLAGS_REG)
 		   (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))])])
 
+(define_expand "atomic_fetch_<logic><mode>"	;; Fetch-then-op via the relaxed CAS loop.
+  [(match_operand:SWI124 0 "register_operand")	;; Result: value before the op.
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")	;; Location operated on.
+    (match_operand:SWI124 2 "register_operand"))	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], <CODE>, false, /* after  */
+				    false); /* doubleword  */
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"	;; Op-then-fetch via the relaxed CAS loop.
+  [(match_operand:SWI124 0 "register_operand")	;; Result: value after the op.
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")	;; Location operated on.
+    (match_operand:SWI124 2 "register_operand"))	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], <CODE>, true, /* after  */
+				    false); /* doubleword  */
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"	;; Fetch-then-nand via the relaxed CAS loop.
+  [(match_operand:SWI124 0 "register_operand")	;; Result: value before the op.
+   (match_operand:SWI124 1 "memory_operand")	;; Location operated on.
+   (match_operand:SWI124 2 "register_operand")	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], NOT, false, /* NOT selects nand; after  */
+				    false); /* doubleword  */
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"	;; Nand-then-fetch via the relaxed CAS loop.
+  [(match_operand:SWI124 0 "register_operand")	;; Result: value after the op.
+   (match_operand:SWI124 1 "memory_operand")	;; Location operated on.
+   (match_operand:SWI124 2 "register_operand")	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], NOT, true, /* NOT selects nand; after  */
+				    false); /* doubleword  */
+  DONE;
+})
+
+(define_expand "atomic_fetch_<logic><mode>"	;; Fetch-then-op, CASMODE variant.
+  [(match_operand:CASMODE 0 "register_operand")	;; Result: value before the op.
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")	;; Location operated on.
+    (match_operand:CASMODE 2 "register_operand"))	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+		    || (<MODE>mode == TImode); /* Early compare is done in halves.  */
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], <CODE>, false, /* after  */
+				    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"	;; Op-then-fetch, CASMODE variant.
+  [(match_operand:CASMODE 0 "register_operand")	;; Result: value after the op.
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")	;; Location operated on.
+    (match_operand:CASMODE 2 "register_operand"))	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+		    || (<MODE>mode == TImode); /* Early compare is done in halves.  */
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], <CODE>, true, /* after  */
+				    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"	;; Fetch-then-nand, CASMODE variant.
+  [(match_operand:CASMODE 0 "register_operand")	;; Result: value before the op.
+   (match_operand:CASMODE 1 "memory_operand")	;; Location operated on.
+   (match_operand:CASMODE 2 "register_operand")	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+		    || (<MODE>mode == TImode); /* Early compare is done in halves.  */
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], NOT, false, /* NOT selects nand; after  */
+				    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"	;; Nand-then-fetch, CASMODE variant.
+  [(match_operand:CASMODE 0 "register_operand")	;; Result: value after the op.
+   (match_operand:CASMODE 1 "memory_operand")	;; Location operated on.
+   (match_operand:CASMODE 2 "register_operand")	;; Second operand.
+   (match_operand:SI 3 "const_int_operand")]	;; Not referenced by the body.
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"	;; Mask set by -mrelax-cmpxchg-loop.
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+		    || (<MODE>mode == TImode); /* Early compare is done in halves.  */
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+				    operands[2], NOT, true, /* NOT selects nand; after  */
+				    doubleword);
+  DONE;
+})
+
+
 ;; For operand 2 nonmemory_operand predicate is used instead of
 ;; register_operand to allow combiner to better optimize atomic
 ;; additions of constants.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2d9c1782f332433b7949b74c37e15b9df42a04d8..6070288856c02bc6e76ad1496424332a58719e78 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1423,7 +1423,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard-reg=@var{reg} @gol
 -mstack-protector-guard-offset=@var{offset} @gol
 -mstack-protector-guard-symbol=@var{symbol} @gol
--mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
+-mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
 -mindirect-branch-register -mneeded}
 
@@ -32330,6 +32330,13 @@ Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mrelax-cmpxchg-loop
+@opindex mrelax-cmpxchg-loop
+Relax the cmpxchg loop by emitting an early load and compare before the
+cmpxchg, and execute pause if the loaded value is not the expected one.
+This reduces excessive cache-line bouncing and works for all atomic
+logic fetch builtins that generate a compare-and-swap loop.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-1.c b/gcc/testsuite/gcc.target/i386/pr103069-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f819af4409c5080fccfe9249c176913c306349e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-1.c
@@ -0,0 +1,35 @@
+/* PR target/103069 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic -mrelax-cmpxchg-loop" } */ 
+/* { dg-final { scan-assembler-times "rep;?\[ \\t\]+nop" 32 } } */
+
+#include <stdint.h>
+
+#define FUNC_ATOMIC(TYPE, OP) /* Define one wrapper per builtin flavor.  */ \
+__attribute__ ((noinline, noclone))	\
+TYPE f_##TYPE##_##OP##_fetch (TYPE *a, TYPE b)	\
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED);  /* OP, then fetch.  */ \
+} \
+__attribute__ ((noinline, noclone))	\
+TYPE f_##TYPE##_fetch_##OP (TYPE *a, TYPE b)	\
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED);  /* Fetch, then OP.  */ \
+}
+
+FUNC_ATOMIC (int64_t, and) /* 16 instantiations x 2 wrappers = 32 functions, as counted by dg-final.  */
+FUNC_ATOMIC (int64_t, nand)
+FUNC_ATOMIC (int64_t, or)
+FUNC_ATOMIC (int64_t, xor)
+FUNC_ATOMIC (int, and)
+FUNC_ATOMIC (int, nand)
+FUNC_ATOMIC (int, or)
+FUNC_ATOMIC (int, xor)
+FUNC_ATOMIC (short, and)
+FUNC_ATOMIC (short, nand)
+FUNC_ATOMIC (short, or)
+FUNC_ATOMIC (short, xor)
+FUNC_ATOMIC (char, and)
+FUNC_ATOMIC (char, nand)
+FUNC_ATOMIC (char, or)
+FUNC_ATOMIC (char, xor)
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c b/gcc/testsuite/gcc.target/i386/pr103069-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..8ac824cc8e8f4e09d6a41f792cf1d82236bcd713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
@@ -0,0 +1,70 @@
+/* PR target/103069 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */ 
+
+#include <stdlib.h>
+#include "pr103069-1.c"
+
+#define FUNC_ATOMIC_RELAX(TYPE, OP) /* Wrappers built with the target attribute.  */ \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))	\
+TYPE relax_##TYPE##_##OP##_fetch (TYPE *a, TYPE b)	\
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED);  /* OP, then fetch.  */ \
+} \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))	\
+TYPE relax_##TYPE##_fetch_##OP (TYPE *a, TYPE b)	\
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED);  /* Fetch, then OP.  */ \
+}
+
+FUNC_ATOMIC_RELAX (int64_t, and) /* Same type/op matrix as the FUNC_ATOMIC list in pr103069-1.c.  */
+FUNC_ATOMIC_RELAX (int64_t, nand)
+FUNC_ATOMIC_RELAX (int64_t, or)
+FUNC_ATOMIC_RELAX (int64_t, xor)
+FUNC_ATOMIC_RELAX (int, and)
+FUNC_ATOMIC_RELAX (int, nand)
+FUNC_ATOMIC_RELAX (int, or)
+FUNC_ATOMIC_RELAX (int, xor)
+FUNC_ATOMIC_RELAX (short, and)
+FUNC_ATOMIC_RELAX (short, nand)
+FUNC_ATOMIC_RELAX (short, or)
+FUNC_ATOMIC_RELAX (short, xor)
+FUNC_ATOMIC_RELAX (char, and)
+FUNC_ATOMIC_RELAX (char, nand)
+FUNC_ATOMIC_RELAX (char, or)
+FUNC_ATOMIC_RELAX (char, xor)
+
+#define TEST_ATOMIC_FETCH_LOGIC(TYPE, OP) /* Relaxed vs. default expansion.  */ \
+{ \
+  TYPE a = 11, b = 101, c = 11, res, exp; /* c mirrors a for the f_ call.  */ \
+  res = relax_##TYPE##_##OP##_fetch (&a, b); \
+  exp = f_##TYPE##_##OP##_fetch (&c, b);  /* Own copy: same start value.  */ \
+  if (res != exp || a != c) /* Check both return value and memory.  */ \
+    abort (); \
+  a = c = 21, b = 92; \
+  res = relax_##TYPE##_fetch_##OP (&a, b); \
+  exp = f_##TYPE##_fetch_##OP (&c, b);  \
+  if (res != exp || a != c) \
+    abort (); \
+}
+
+int main (void) /* Check every type/op pair; the macro aborts on a mismatch.  */
+{
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, and)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, or)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, xor)
+  TEST_ATOMIC_FETCH_LOGIC (int, and)
+  TEST_ATOMIC_FETCH_LOGIC (int, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int, or)
+  TEST_ATOMIC_FETCH_LOGIC (int, xor)
+  TEST_ATOMIC_FETCH_LOGIC (short, and)
+  TEST_ATOMIC_FETCH_LOGIC (short, nand)
+  TEST_ATOMIC_FETCH_LOGIC (short, or)
+  TEST_ATOMIC_FETCH_LOGIC (short, xor)
+  TEST_ATOMIC_FETCH_LOGIC (char, and)
+  TEST_ATOMIC_FETCH_LOGIC (char, nand)
+  TEST_ATOMIC_FETCH_LOGIC (char, or)
+  TEST_ATOMIC_FETCH_LOGIC (char, xor)
+  return 0; /* Reached only if no check aborted.  */
+}