diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index bbecc7a6ddaee..ac3233ec30ee3 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2173,6 +2173,14 @@ class TargetLoweringBase { return false; } + // The memory ordering that AtomicExpandPass should assign to an atomic + // instruction that it has lowered by adding fences. This can be used + // to "fold" one of the fences into the atomic instruction. + virtual AtomicOrdering + atomicOperationOrderAfterFenceSplit(const Instruction *I) const { + return AtomicOrdering::Monotonic; + } + /// Whether AtomicExpandPass should automatically insert a trailing fence /// without reducing the ordering for this atomic. Defaults to false. virtual bool diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index a75fa688d87a8..a3e9700fa3089 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // failure path. As a result, fence insertion is directly done by // expandAtomicCmpXchg in that case. 
FenceOrdering = CASI->getMergedOrdering(); - CASI->setSuccessOrdering(AtomicOrdering::Monotonic); - CASI->setFailureOrdering(AtomicOrdering::Monotonic); + auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); + + CASI->setSuccessOrdering(CASOrdering); + CASI->setFailureOrdering(CASOrdering); } if (FenceOrdering != AtomicOrdering::Monotonic) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 58ad92a8934a6..f2b4bc3a4b561 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/FPEnv.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -49,6 +50,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // actions computeRegisterProperties(STI.getRegisterInfo()); + // PTX support for 16-bit CAS is emulated. Only use 32+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); @@ -5565,6 +5568,70 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { return AtomicExpansionKind::CmpXChg; } +bool NVPTXTargetLowering::shouldInsertFencesForAtomic( + const Instruction *I) const { + auto *CI = dyn_cast(I); + // When CAS bitwidth is not supported on the hardware, the CAS is emulated + // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce + // the memory order using explicit fences around the retry loop. 
+ // The memory order of natively supported CAS operations can be enforced + // by lowering to an atom.cas with the right memory synchronizing effect. + // However, atom.cas only supports relaxed, acquire, release and acq_rel. + // So we also use explicit fences for enforcing memory order for + // seq_cst CAS with natively-supported bitwidths. + return CI && + (cast(CI->getCompareOperand()->getType())->getBitWidth() < + STI.getMinCmpXchgSizeInBits() || + CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent); +} + +AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( + const Instruction *I) const { + auto *CI = dyn_cast(I); + bool BitwidthSupportedAndIsSeqCst = + CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent && + cast(CI->getCompareOperand()->getType())->getBitWidth() >= + STI.getMinCmpXchgSizeInBits(); + return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire + : AtomicOrdering::Monotonic; +} + +Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (!isa(Inst)) + return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); + + // Specialize for cmpxchg + // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + if (isReleaseOrStronger(Ord)) + return Ord == AtomicOrdering::SequentiallyConsistent + ? 
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) + : Builder.CreateFence(AtomicOrdering::Release); + + return nullptr; +} + +Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + // Specialize for cmpxchg + if (!isa(Inst)) + return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + + auto CASWidth = + cast( + dyn_cast(Inst)->getCompareOperand()->getType()) + ->getBitWidth(); + // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated + if (isAcquireOrStronger(Ord) && + (Ord != AtomicOrdering::SequentiallyConsistent || + CASWidth < STI.getMinCmpXchgSizeInBits())) + return Builder.CreateFence(AtomicOrdering::Acquire); + + return nullptr; +} + // Pin NVPTXTargetObjectFile's vtables to this file. NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 5adf69d621552..1ee7a9b9ab8e3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -17,6 +17,7 @@ #include "NVPTX.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Support/AtomicOrdering.h" namespace llvm { namespace NVPTXISD { @@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering { return true; } + bool shouldInsertFencesForAtomic(const Instruction *) const override; + + AtomicOrdering + atomicOperationOrderAfterFenceSplit(const Instruction *I) const override; + + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + private: const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 06c629c01d9ab..ef4c942a49a92 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG Pred> { let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, regclass:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>, Requires; def imm1 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, regclass:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>, Requires; def imm2 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, IMMType:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>, Requires; def imm3 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, IMMType:$c), - !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>, Requires; } } -multiclass F_ATOMIC_3 Pred = []> { - defm p32 : F_ATOMIC_3_imp; - defm p64 : F_ATOMIC_3_imp; +multiclass F_ATOMIC_3 Pred = []> { + defm p32 : F_ATOMIC_3_imp; + defm p64 : F_ATOMIC_3_imp; } // atom_add @@ -2378,51 +2378,76 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2]>; -// 
atom_cas - -def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; - -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3; -defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3; +multiclass ternary_atomic_op_as { + // one record per address space + def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + (!cast(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.generic>; + 
+ def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + (!cast(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.global>; + + def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val), + (!cast(NAME) node:$ptr, node:$cmp, node:$val), + AS_match.shared>; +} + +// generate pattern fragments for size x memory order +// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs +// using larger-bitwidth cas +foreach size = ["i16", "i32", "i64"] in { + foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in { + defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as; + } +} + +// eg. with type = 32, order = ".acquire", addrspace = ".global", +// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. +// preds = [hasSM<70>, hasPTX<63>] +// F_ATOMIC_3, hasPTX<63>]> +multiclass INT_PTX_ATOM_CAS preds> + : F_ATOMIC_3("i"#type), + !cast("Int"#type#"Regs"), + order, + addrspace, + ".b"#type, + ".cas", + !cast(atomic_cmp_swap_pat), + !cast("i"#type#"imm"), + preds>; + +// Define atom.cas for all combinations of size x addrspace x memory order +// supported in PTX *and* on the hardware. +foreach size = ["32", "64"] in { + foreach addrspace = ["generic", "global", "shared"] in { + defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); + foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { + defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + // Note that AtomicExpand will convert a natively-supported cmpxchg seq_cst to a cmpxchg acquire preceded by a seq_cst fence. + // Memory orders are only supported for SM70+, PTX63+, so we have two sets of instruction definitions: + // one for SM70+, and "old" ones, which lower to a plain "atom.cas", for earlier archs. 
+ defm INT_PTX_ATOM_CAS_#size#_#order#addrspace + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, + cas_order_string, cas_addrspace_string, + [hasSM<70>, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, + "", cas_addrspace_string, []>; + } + } +} + +// Note that 16-bit CAS support in PTX is emulated. +defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 851c9152e4cb8..f893f1aefec84 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // set of equivalent memory operations with a scalar data-type, executed in // an unspecified order on the elements in the vector. unsigned getMaxRequiredAlignment() const { return 8; } - // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS + // Get the smallest cmpxchg word size that the hardware supports. 
unsigned getMinCmpXchgSizeInBits() const { return 32; } unsigned getPTXVersion() const { return PTXVersion; } diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 67abfe8295a62..9027bd6a14780 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.u32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.u32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.u32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; 
CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.u32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index fccc3a8844ffc..2f58d279f82c3 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -1,47 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %} ; CHECK-LABEL: atom0 define i32 @atom0(ptr %addr, i32 %val) { -; CHECK: atom.add.u32 +; CHECK-LABEL: atom0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom0_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom0_param_1]; +; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw add ptr %addr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom1 define i64 @atom1(ptr %addr, i64 %val) { -; CHECK: atom.add.u64 +; CHECK-LABEL: atom1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom1_param_1]; +; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw add ptr %addr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom2 define i32 @atom2(ptr %subr, i32 %val) { -; CHECK: neg.s32 -; CHECK: atom.add.u32 +; CHECK-LABEL: atom2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // 
%bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .s32 temp; +; CHECK-NEXT: neg.s32 temp, %r1; +; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom3 define i64 @atom3(ptr %subr, i64 %val) { -; CHECK: neg.s64 -; CHECK: atom.add.u64 +; CHECK-LABEL: atom3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .s64 temp; +; CHECK-NEXT: neg.s64 temp, %rd2; +; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom4 define i32 @atom4(ptr %subr, i32 %val) { -; CHECK: atom.and.b32 +; CHECK-LABEL: atom4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom4_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom4_param_1]; +; CHECK-NEXT: atom.and.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw and ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom5 define i64 @atom5(ptr %subr, i64 %val) { -; CHECK: atom.and.b64 +; CHECK-LABEL: atom5( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom5_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom5_param_1]; +; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw and ptr %subr, i64 %val seq_cst ret i64 %ret } @@ -59,84 +123,198 @@ define i64 
@atom5(ptr %subr, i64 %val) { ; CHECK-LABEL: atom8 define i32 @atom8(ptr %subr, i32 %val) { -; CHECK: atom.or.b32 +; CHECK-LABEL: atom8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom8_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom8_param_1]; +; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw or ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom9 define i64 @atom9(ptr %subr, i64 %val) { -; CHECK: atom.or.b64 +; CHECK-LABEL: atom9( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom9_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom9_param_1]; +; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw or ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom10 define i32 @atom10(ptr %subr, i32 %val) { -; CHECK: atom.xor.b32 +; CHECK-LABEL: atom10( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom10_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom10_param_1]; +; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom11 define i64 @atom11(ptr %subr, i64 %val) { -; CHECK: atom.xor.b64 +; CHECK-LABEL: atom11( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom11_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom11_param_1]; +; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %subr, i64 %val seq_cst ret i64 %ret 
} ; CHECK-LABEL: atom12 define i32 @atom12(ptr %subr, i32 %val) { -; CHECK: atom.max.s32 +; CHECK-LABEL: atom12( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom12_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom12_param_1]; +; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw max ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom13 define i64 @atom13(ptr %subr, i64 %val) { -; CHECK: atom.max.s64 +; CHECK-LABEL: atom13( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom13_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom13_param_1]; +; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw max ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom14 define i32 @atom14(ptr %subr, i32 %val) { -; CHECK: atom.min.s32 +; CHECK-LABEL: atom14( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom14_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom14_param_1]; +; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw min ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom15 define i64 @atom15(ptr %subr, i64 %val) { -; CHECK: atom.min.s64 +; CHECK-LABEL: atom15( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom15_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom15_param_1]; +; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw min ptr %subr, i64 %val seq_cst ret i64 %ret } ; 
CHECK-LABEL: atom16 define i32 @atom16(ptr %subr, i32 %val) { -; CHECK: atom.max.u32 +; CHECK-LABEL: atom16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom16_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom16_param_1]; +; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom17 define i64 @atom17(ptr %subr, i64 %val) { -; CHECK: atom.max.u64 +; CHECK-LABEL: atom17( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom17_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom17_param_1]; +; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %subr, i64 %val seq_cst ret i64 %ret } ; CHECK-LABEL: atom18 define i32 @atom18(ptr %subr, i32 %val) { -; CHECK: atom.min.u32 +; CHECK-LABEL: atom18( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom18_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom18_param_1]; +; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %subr, i32 %val seq_cst ret i32 %ret } ; CHECK-LABEL: atom19 define i64 @atom19(ptr %subr, i64 %val) { -; CHECK: atom.min.u64 +; CHECK-LABEL: atom19( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom19_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [atom19_param_1]; +; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %subr, i64 %val seq_cst ret i64 %ret } @@ -145,7 
+323,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ; CHECK-LABEL: atomic_add_f32_generic define float @atomic_add_f32_generic(ptr %addr, float %val) { -; CHECK: atom.add.f32 +; CHECK-LABEL: atomic_add_f32_generic( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_generic_param_1]; +; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ret float %ret } @@ -154,7 +342,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %v ; CHECK-LABEL: atomic_add_f32_addrspace1 define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { -; CHECK: atom.global.add.f32 +; CHECK-LABEL: atomic_add_f32_addrspace1( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1]; +; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val) ret float %ret } @@ -163,49 +361,152 @@ declare float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %v ; CHECK-LABEL: atomic_add_f32_addrspace3 define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { -; CHECK: atom.shared.add.f32 +; CHECK-LABEL: atomic_add_f32_addrspace3( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.f32 
%f1, [atomic_add_f32_addrspace3_param_1]; +; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val) ret float %ret } ; CHECK-LABEL: atomicrmw_add_f32_generic define float @atomicrmw_add_f32_generic(ptr %addr, float %val) { -; CHECK: atom.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_generic( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1]; +; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, float %val seq_cst ret float %ret } ; CHECK-LABEL: atomicrmw_add_f16_generic define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { -; CHECK: atom.cas +; CHECK-LABEL: atomicrmw_add_f16_generic( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .f32 %f<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1]; +; CHECK-NEXT: ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0]; +; CHECK-NEXT: and.b64 %rd1, %rd2, -4; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd2; +; CHECK-NEXT: and.b32 %r7, %r6, 3; +; CHECK-NEXT: shl.b32 %r1, %r7, 3; +; CHECK-NEXT: mov.b32 %r8, 65535; +; CHECK-NEXT: shl.b32 %r9, %r8, %r1; +; CHECK-NEXT: not.b32 %r2, %r9; +; CHECK-NEXT: ld.u32 %r16, [%rd1]; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; +; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: shr.u32 %r10, %r16, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r10; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; +; 
CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f3; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; +; CHECK-NEXT: shl.b32 %r12, %r11, %r1; +; CHECK-NEXT: and.b32 %r13, %r16, %r2; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14; +; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16; +; CHECK-NEXT: mov.u32 %r16, %r5; +; CHECK-NEXT: @%p1 bra $L__BB22_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: shr.u32 %r15, %r5, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r15; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret } ; CHECK-LABEL: atomicrmw_add_f32_addrspace1 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { -; CHECK: atom.global.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_addrspace1( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1]; +; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst ret float %ret } ; CHECK-LABEL: atomicrmw_add_f32_addrspace3 define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { -; CHECK: atom.shared.add.f32 +; CHECK-LABEL: atomicrmw_add_f32_addrspace3( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1]; +; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst ret float %ret } ; 
CHECK-LABEL: atomic_cmpxchg_i32 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { -; CHECK: atom.cas.b32 +; CHECK-LABEL: atomic_cmpxchg_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0]; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2]; +; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new } ; CHECK-LABEL: atomic_cmpxchg_i64 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { -; CHECK: atom.cas.b64 +; CHECK-LABEL: atomic_cmpxchg_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0]; +; CHECK-NEXT: membar.sys; +; CHECK-NEXT: ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1]; +; CHECK-NEXT: ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2]; +; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll new file mode 100644 index 0000000000000..ea308c2a7673b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
monotonic_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB0_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB0_1; +; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB1_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB1_1; +; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB2_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB2_1; +; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; 
+; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB3_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB3_1; +; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 
%r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB4_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB4_1; +; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, 
%r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB5_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB5_1; +; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB6_3; +; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB6_1; +; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB7_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; 
SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB7_1; +; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB8_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB8_1; +; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB9_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB9_1; +; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, 
i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB10_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB10_1; +; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 
%rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB11_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB11_1; +; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB12_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB12_1; +; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: 
not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB13_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB13_1; +; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, 
[acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB14_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB14_1; +; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB15_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB15_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB15_1; +; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, 
[%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB16_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB16_1; +; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB17_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in 
Loop: Header=BB17_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB17_1; +; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB18_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB18_1; +; 
SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB19_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB19_1; +; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + 
ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB20_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB20_1; +; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: 
.reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB21_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB21_1; +; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; +; 
SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB22_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB22_1; +; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB23_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB23_1; +; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB24_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB24_1; +; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB25_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB25_1; +; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB26_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB26_1; +; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB27_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB27_1; +; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB28_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB28_1 
Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB28_1; +; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB29_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB29_1; +; 
SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB30_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB30_1; +; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB31_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB31_1; +; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared( +; SM60: { +; 
SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB32_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB32_1; +; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB33_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB33_1; +; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB34_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB34_1; +; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, 
%r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB35_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB35_1; +; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB36_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB36_1; +; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; 
SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB37_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB37_1; +; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 
%r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB38_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB38_1; +; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB39_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB39_1; +; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB40_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; 
+; SM60-NEXT: @%p2 bra $L__BB40_1; +; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB41_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB41_1; +; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB42_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB42_1; +; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; 
SM60-LABEL: seq_cst_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB43_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB43_1; +; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 
%rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB44_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB44_1; +; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB45_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB45_1; +; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: 
shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB46_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB46_1; +; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB47_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB47_1; +; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; 
SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB48_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB48_1; +; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB49_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB49_1; +; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB50_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB50_1; +; SM60-NEXT: 
$L__BB50_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB51_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB51_1; +; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret 
i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB52_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB52_1; +; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred 
%p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB53_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB53_1; +; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, 
[acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB54_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB54_1; +; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB55_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB55_1; +; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, 
%r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB56_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB56_1; +; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB57_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB57_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB57_1; +; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, 
%r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB58_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB58_1; +; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB59_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB59_1; +; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB60_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB60_1; +; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; 
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB61_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB61_1; +; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acquire_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB62_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB62_1; +; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // 
%bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB63_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB63_1; +; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[release_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB64_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB64_1; +; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 
%r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB65_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB65_1; +; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, 
[%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB66_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB66_1; +; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB67_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB67_1; +; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB68_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB68_1; +; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB69_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB69_1; +; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB70_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB70_1; +; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB71_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB71_1; +; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic( +; SM60: { +; 
SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB72_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB72_1; +; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, 
[acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB73_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB73_1; +; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB74_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB74_1; +; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: 
mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB75_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB75_1; +; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: 
ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB76_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB76_1; +; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, 
%r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB77_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB77_1; +; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB78_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB78_1; +; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB79_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB79_1; +; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB80_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB80_1; +; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB81_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB81_1; +; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global( +; SM60: { +; SM60-NEXT: 
.reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB82_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB82_1; +; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 
%rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB83_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB83_1; +; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, 
[seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB84_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB84_1; +; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; 
+; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB85_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB85_1; +; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB86_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB86_1; +; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, 
%r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB87_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB87_1; +; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB88_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB88_1; +; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB89_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; 
SM60-NEXT: @%p2 bra $L__BB89_1; +; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: 
ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; 
SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acquire_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { 
+; SM60-LABEL: acquire_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 
@acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: 
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg 
.b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) 
{ +; SM60-LABEL: acquire_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global( +; 
SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[release_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: 
ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll new file mode 100644 index 0000000000000..4360ea36e863a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB0_1; +; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, 
%r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB1_1; +; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB2_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB2_1; +; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, 
[%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB3_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB3_1; +; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; 
SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB4_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB4_1; +; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra 
$L__BB5_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB5_1; +; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB6_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB6_1; +; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB7_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB7_1; +; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB8_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB8_1; +; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB9_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB9_1; +; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; 
SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB10_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB10_1; +; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[acquire_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB11_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB11_1; +; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB12_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB12_1; +; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB13_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB13_1; +; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, 
%r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB14_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB14_1; +; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB15_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB15_1; +; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB16_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB16_1; +; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB17_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: 
Header=BB17_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB17_1; +; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB18_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB18_1; +; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB19_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB19_1; +; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB20_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB20_1; +; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic( +; SM70: { +; 
SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB21_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB21_1; +; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB22_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB22_1; +; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB23_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB23_1; +; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 
3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB24_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB24_1; +; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 
255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB25_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB25_1; +; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB26_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB26_1; +; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB27_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB27_1; +; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], 
%r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB28_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB28_1; +; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB29_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB29_1; +; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB30_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: 
mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB30_1; +; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB31_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB31_1; +; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB32_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB32_1; +; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + 
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB33_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB33_1; +; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB34_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB34_1; +; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB35_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB35_1; +; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 
%rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB36_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB36_1; +; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; 
SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB37_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB37_1; +; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 
255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB38_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB38_1; +; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB39_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB39_1; +; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB40_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB40_1; +; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB41_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB41_1; +; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB42_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB42_1; +; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB43_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB43_1; +; SM70-NEXT: 
$L__BB43_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB44_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB44_1; +; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB45_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB45_1; +; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg 
.pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB46_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB46_1; +; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB47_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB47_1; +; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB48_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB48_1; +; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB49_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB49_1; +; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; 
SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB50_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB50_1; +; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; 
SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB51_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB51_1; +; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB52_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB52_1; +; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB53_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB53_1; +; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB54_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB54_1; +; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB55_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB55_1; +; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
acquire_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB56_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB56_1; +; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB57_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB57_1; +; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, 
[acquire_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB58_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB58_1; +; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 
%r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB59_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB59_1; +; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 
%r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB60_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB60_1; +; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; 
SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB61_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB61_1; +; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB62_3; 
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB62_1; +; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB63_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB63_1; +; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB64_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB64_1; +; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB65_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB65_1; +; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic( 
+; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB66_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB66_1; +; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB67_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB67_1; +; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB68_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB68_1; +; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: 
shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB69_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB69_1; +; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB70_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB70_1; +; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB71_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB71_1; +; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB72_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB72_1; +; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB73_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB73_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB73_1; +; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB74_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB74_1; +; 
SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB75_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB75_1; +; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB76_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB76_1; +; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared( +; 
SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB77_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB77_1; +; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB78_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB78_1; +; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, 
[acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB79_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB79_1; +; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, 
%r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB80_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB80_1; +; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 
%r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB81_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB81_1; +; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB82_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB82_1; +; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB83_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB83_1; +; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB84_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB84_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB84_1; +; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB85_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB85_1; +; SM70-NEXT: $L__BB85_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB86_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB86_1; +; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB87_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB87_1; +; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred 
%p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB88_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB88_1; +; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB89_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB89_1; +; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; 
SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic( +; SM70: { +; 
SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
acquire_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire 
seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold 
= cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; 
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; 
SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
seq_cst_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire 
+ ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.relaxed.global.cas.b64 
%rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM70-NEXT: 
atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acquire_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 
%rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; 
+; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll new file mode 100644 index 0000000000000..5acb275a6f581 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -0,0 +1,5680 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} + +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB0_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB0_1; +; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, 
%r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB1_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB1_1; +; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB2_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB2_1; +; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, 
[%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB3_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB3_1; +; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; 
SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB4_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB4_1; +; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra 
$L__BB5_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB5_1; +; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB6_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB6_1; +; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB7_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB7_1; +; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + ret i8 %new +} + +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB8_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB8_1; +; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
monotonic seq_cst + ret i8 %new +} + +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB9_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB9_1; +; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; 
SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB10_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB10_1; +; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB11_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB11_1; +; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + ret i8 %new +} + +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB12_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB12_1; +; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB13_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB13_1; +; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB14_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB14_1; +; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB15_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB15_1; +; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB16_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB16_1; +; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB17_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB17_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB17_1; +; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + ret i8 %new +} + +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB18_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra 
$L__BB18_1; +; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB19_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB19_1; +; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new release monotonic + ret i8 %new +} + +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB20_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB20_1; +; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic( +; SM90: { +; 
SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB21_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB21_1; +; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB22_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB22_1; +; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM90-NEXT: 
fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB23_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB23_1; +; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 
3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB24_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB24_1; +; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 
255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB25_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB25_1; +; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB26_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB26_1; +; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB27_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB27_1; +; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], 
%r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB28_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB28_1; +; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB29_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB29_1; +; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB30_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: 
mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB30_1; +; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB31_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB31_1; +; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB32_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB32_1; +; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + 
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB33_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB33_1; +; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 
%rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB34_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB34_1; +; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB35_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB35_1; +; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 
%rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB36_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB36_1; +; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; 
SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB37_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB37_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB37_1; +; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 
255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB38_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB38_1; +; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB39_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB39_1; +; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB40_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB40_1; +; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB41_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB41_1; +; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB42_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1 +; 
SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB42_1; +; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB43_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB43_1; +; SM90-NEXT: 
$L__BB43_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB44_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB44_1; +; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB45_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB45_1; +; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg 
.pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB46_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB46_1; +; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB47_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB47_1; +; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB48_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB48_1; +; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB49_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB49_1; +; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; 
SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB50_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB50_1; +; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; 
SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB51_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB51_1; +; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB52_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB52_1; +; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB53_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: 
mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB53_1; +; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB54_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB54_1; +; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB55_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB55_1; +; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
acquire_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB56_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB56_1; +; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB57_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB57_1; +; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, 
[acquire_acquire_i16_global_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB58_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB58_1; +; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 
%r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB59_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB59_1; +; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 
%r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB60_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB60_1; +; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; 
SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB61_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB61_1; +; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB62_3; 
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB62_1; +; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB63_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB63_1; +; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB64_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB64_1; +; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB65_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB65_1; +; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic( 
+; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB66_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB66_1; +; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB67_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB67_1; +; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: 
ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB68_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB68_1; +; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: 
shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB69_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB69_1; +; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB70_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB70_1; +; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB71_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB71_1; +; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB72_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB72_1; +; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB73_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB73_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB73_1; +; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB74_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB74_1; +; 
SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB75_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB75_1; +; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB76_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB76_1; +; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared( +; 
SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB77_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB77_1; +; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB78_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB78_1; +; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB79_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB79_1; +; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, 
%r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB80_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB80_1; +; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 
%r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB81_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB81_1; +; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB82_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB82_1; +; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB83_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB83_1; +; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB84_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB84_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB84_1; +; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB85_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB85_1; +; SM90-NEXT: $L__BB85_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB86_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB86_1; +; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB87_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB87_1; +; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred 
%p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB88_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB88_1; +; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB89_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB89_1; +; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic( +; SM90: { +; 
SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
acquire_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire 
seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold 
= cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; 
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; 
SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
seq_cst_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire 
+ ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.relaxed.global.cas.b64 
%rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM90-NEXT: 
atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM90-NEXT: 
ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 
%rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; 
+; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 33a1f15c6a5cd..aaea0d2ee25ef 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -7,6 +7,7 @@ ; TODO: these are system scope, but are compiled to gpu scope.. ; TODO: these are seq_cst, but are compiled to relaxed.. 
+ ; CHECK-LABEL: relaxed_sys_i8 define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-LABEL: relaxed_sys_i8( @@ -17,86 +18,1153 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB0_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB0_1; +; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB0_1; +; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, 
[relaxed_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB0_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB0_1; +; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + ret i8 %new +} + +define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: acquire_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; 
SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB1_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB1_1; +; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB1_1; +; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; 
SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB1_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB1_1; +; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + ret i8 %new +} + +define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: release_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; 
SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB2_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB2_1; +; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB2_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB2_1; +; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: 
or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB2_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB2_1; +; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + ret i8 %new +} + +define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: acq_rel_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB3_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; 
SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB3_1; +; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB3_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB3_1; +; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: acq_rel_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; 
+; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB3_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB3_1; +; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new +} + +define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { +; SM30-LABEL: seq_cst_sys_i8( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r17, %r20, %r3; +; SM30-NEXT: or.b32 %r18, %r20, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM30-NEXT: @%p1 bra $L__BB4_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM30-NEXT: mov.u32 %r20, %r8; +; SM30-NEXT: @%p2 bra $L__BB4_1; +; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i8( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: shl.b32 
%r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB4_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB4_1; +; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i8( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB4_3; 
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB4_1; +; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +; CHECK-LABEL: relaxed_sys_i16 +define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: relaxed_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB5_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB5_1; +; SM30-NEXT: $L__BB5_3: // 
%partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: relaxed_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB5_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB5_1; +; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 
%r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB5_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB5_1; +; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: acquire_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: 
shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB6_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB6_1; +; SM30-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB6_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB6_1; +; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB6_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB6_1; +; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + +define i16 @release_sys_i16(ptr 
%addr, i16 %cmp, i16 %new) { +; SM30-LABEL: release_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: and.b64 %rd1, %rd2, -4; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM30-NEXT: // =>This Inner Loop Header: Depth=1 +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB7_3; +; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM30-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM30-NEXT: and.b32 %r8, %r7, %r2; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB7_1; +; SM30-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i16( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB7_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB7_1; +; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB7_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB7_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB7_1; +; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + +define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; SM30-LABEL: acq_rel_sys_i16( +; SM30: { +; SM30-NEXT: .reg .pred %p<3>; +; SM30-NEXT: .reg .b16 %rs<2>; +; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b64 %rd<3>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 65535; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; 
-; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM30-NEXT: shl.b32 %r4, %r9, %r1; +; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM30-NEXT: @%p1 bra $L__BB0_3; +; SM30-NEXT: or.b32 %r16, %r19, %r3; +; SM30-NEXT: or.b32 %r17, %r19, %r4; +; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM30-NEXT: @%p1 bra $L__BB8_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM30-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM30-NEXT: mov.u32 %r20, %r8; -; SM30-NEXT: @%p2 bra $L__BB0_1; -; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM30-NEXT: mov.u32 %r19, %r8; +; SM30-NEXT: @%p2 bra $L__BB8_1; +; SM30-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; -; SM70-LABEL: relaxed_sys_i8( +; SM70-LABEL: acq_rel_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: 
and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB0_3; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB0_1; -; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB8_1; +; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 
[func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic - ret i8 %new +; SM90-LABEL: acq_rel_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB8_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB8_1; +; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new } -; CHECK-LABEL: relaxed_sys_i16 -define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { -; SM30-LABEL: relaxed_sys_i16( +; CHECK-LABEL: seq_cst_sys_i16 +define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { +; 
SM30-LABEL: seq_cst_sys_i16( ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; @@ -104,10 +1172,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -119,24 +1188,25 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; -; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 ; SM30-NEXT: or.b32 %r16, %r19, %r3; ; SM30-NEXT: or.b32 %r17, %r19, %r4; ; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM30-NEXT: @%p1 bra $L__BB1_3; +; SM30-NEXT: @%p1 bra $L__BB9_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM30-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM30-NEXT: and.b32 %r8, %r7, %r2; ; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM30-NEXT: mov.u32 %r19, %r8; -; SM30-NEXT: @%p2 bra $L__BB1_1; -; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM30-NEXT: @%p2 bra $L__BB9_1; +; SM30-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM30-NEXT: membar.sys; ; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; -; SM70-LABEL: relaxed_sys_i16( +; SM70-LABEL: seq_cst_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg 
.b16 %rs<2>; @@ -144,10 +1214,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -159,23 +1230,65 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB1_3; +; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.u32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB1_1; -; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB9_1; +; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic +; SM90-LABEL: seq_cst_sys_i16( +; SM90: { +; SM90-NEXT: .reg .pred 
%p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB9_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB9_1; +; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -203,13 +1316,197 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic ret i32 %new } +define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: acq_rel_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: acq_rel_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel 
acquire + ret i32 %new +} + +define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: acquire_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + +define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: release_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + +define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { +; SM30-LABEL: seq_cst_sys_i32( +; SM30: { +; SM30-NEXT: .reg .b32 %r<4>; +; SM30-NEXT: .reg .b64 %rd<2>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i32( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + ; CHECK-LABEL: relaxed_sys_i64 define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-LABEL: relaxed_sys_i64( @@ -232,11 +1529,183 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; +; SM90-LABEL: relaxed_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic ret i64 %new } + +define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: acquire_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, 
[acquire_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: acquire_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: acquire_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + +define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: acq_rel_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: acq_rel_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; 
SM90-LABEL: acq_rel_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + +define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: release_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: release_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: release_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + +define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { +; SM30-LABEL: 
seq_cst_sys_i64( +; SM30: { +; SM30-NEXT: .reg .b64 %rd<5>; +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; +; SM30-NEXT: ret; +; +; SM70-LABEL: seq_cst_sys_i64( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; +; SM90-LABEL: seq_cst_sys_i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py new file mode 100644 index 0000000000000..ae7450015ecd2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -0,0 +1,57 @@ +# For manual usage, not as a part of lit tests. 
Used for generating the following tests: +# cmpxchg-sm60.ll, cmpxchg-sm70.ll, cmpxchg-sm90.ll + +from string import Template +from itertools import product + +cmpxchg_func = Template( + """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure + ret i$size %new +} +""" +) + +run_statement = Template( + """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm} +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %} +""" +) + +TESTS = [(60, 50), (70, 63), (90, 87)] + +LLVM_SCOPES = ["", "block", "cluster", "device"] + +SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"} + +SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"] + +FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"] + +SIZES = [8, 16, 32, 64] + +ADDRSPACES = [0, 1, 3] + +ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"} + +if __name__ == "__main__": + for sm, ptx in TESTS: + with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: + print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) + for size, success, failure, addrspace in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES + ): + if addrspace == 0: + addrspace_cast = "" + else: + addrspace_cast = " addrspace({})".format(str(addrspace)) + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=addrspace_cast, + ), + file=fp, + ) diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg index 54a6c338bdf85..84cce669ec10b 100644 --- a/llvm/test/CodeGen/NVPTX/lit.local.cfg +++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg @@ -1,4 +1,4 @@ if not "NVPTX" in config.root.targets:
config.unsupported = True config.suffixes.add(".py") -config.excludes = ["fence.py"] +config.excludes = ["fence.py", "cmpxchg.py"]