From 38db2d3d9066d736b5f4a43e59d0c4ab8c68dab0 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 9 May 2025 01:00:43 +0000 Subject: [PATCH] [NVPTX] Add syncscope support for cmpxchg --- llvm/include/llvm/CodeGen/TargetLowering.h | 16 +- llvm/lib/CodeGen/AtomicExpandPass.cpp | 18 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 10 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 13 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 12 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 67 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 12 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 16140 +++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 16238 +++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 21932 ++++++++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 40 +- llvm/test/CodeGen/NVPTX/cmpxchg.py | 13 +- 18 files changed, 47182 insertions(+), 7377 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index ac9ab7f7fd210..265f1fd724237 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2286,13 +2286,15 @@ class TargetLoweringBase { /// standard ABI uses a fence before a seq_cst load instead of after a /// seq_cst store). /// @{ - virtual Instruction *emitLeadingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; - - virtual Instruction *emitTrailingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; + virtual Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; + + virtual Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index a3e9700fa3089..1b9e0056eae74 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -65,7 +65,8 @@ class AtomicExpandImpl { const DataLayout *DL = nullptr; private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, + SyncScope::ID SSID = SyncScope::System); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); bool tryExpandAtomicLoad(LoadInst *LI); @@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; + SyncScope::ID SSID = SyncScope::System; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); @@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // expandAtomicCmpXchg in that case. 
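// Illustrative sketch (hypothetical IR, not part of this patch): for an input like
//   %old = cmpxchg ptr %p, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
// the splitting below is now expected to produce, roughly,
//   fence syncscope("device") seq_cst
//   %old = cmpxchg ptr %p, i32 %cmp, i32 %new monotonic monotonic
//   fence syncscope("device") seq_cst
// so the bracketing fences keep the cmpxchg's syncscope instead of defaulting to
// system scope; the exact orderings kept on the cmpxchg and fences are
// target-dependent (see atomicOperationOrderAfterFenceSplit and
// emitLeadingFence/emitTrailingFence).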
FenceOrdering = CASI->getMergedOrdering(); auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); + SSID = CASI->getSyncScopeID(); CASI->setSuccessOrdering(CASOrdering); CASI->setFailureOrdering(CASOrdering); + // If CAS ordering is monotonic, then the operation will + // take default scope. Otherwise, it will retain its scope + if (CASOrdering != AtomicOrdering::Monotonic) + CASI->setSyncScopeID(SSID); } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering); + MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID); } } else if (I->hasAtomicStore() && TLI->shouldInsertTrailingFenceForAtomicStore(I)) { @@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F, } bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, - AtomicOrdering Order) { + AtomicOrdering Order, + SyncScope::ID SSID) { ReplacementIRBuilder Builder(I, *DL); - auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID); - auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID); // We have a guard here because not every atomic operation generates a // trailing fence. if (TrailingFence) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index f5ea3c0b47d6a..61d8b1de30ff7 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2320,18 +2320,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isAcquireOrStronger(Ord)) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e3dc337bd0843..9bd5166c19c24 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21221,7 +21221,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -21246,7 +21247,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 9fad056edd3f1..da09eca2b946f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -666,10 +666,12 @@ class VectorType; void 
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction *emitLeadingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitTrailingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3e755c25fd91a..946c44ac82abb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6055,7 +6055,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (!isa<AtomicCmpXchgInst>(Inst)) return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); @@ -6063,15 +6064,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated if (isReleaseOrStronger(Ord)) return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, + SSID) + : Builder.CreateFence(AtomicOrdering::Release, SSID); return nullptr; } Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { // Specialize for cmpxchg if (!isa<AtomicCmpXchgInst>(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); @@ -6084,7 +6087,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index f41c569a65544..07304adf21ac2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -266,10 +266,14 @@ class NVPTXTargetLowering : public TargetLowering { AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; private: const NVPTXSubtarget &STI; // cache the subtarget here diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 7d7e69adafcd0..e02c335bc8d13 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -38,6 +38,27 @@ def AS_match { }]; } +multiclass nvvm_ternary_atomic_op_scoped { + defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val); + def NAME#_cta: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Block; + }]>; + def NAME#_cluster : PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Cluster; + }]>; + def NAME#_gpu: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Device; + }]>; + def NAME#_sys: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::System; + }]>; +} + + // A node that will be replaced with the current PTX version. class PTX { SDNodeXForm PTXVerXform = SDNodeXForm Pred> { + ValueType regT, NVPTXRegClass regclass, string SemStr, + string ScopeStr, string SpaceStr, string TypeStr, string OpcStr, + PatFrag IntOp, Operand IMMType, list Pred> { let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, regclass:$c), - !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>, Requires; def imm1 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, regclass:$c), - !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>, Requires; def imm2 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, IMMType:$c), - !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), + !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>, Requires; def imm3 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, IMMType:$c), - !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), + !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>, Requires; } } -multiclass F_ATOMIC_3 Pred = []> { - defm p32 : F_ATOMIC_3_imp Pred = []> { + defm p32 : F_ATOMIC_3_imp; - defm p64 : F_ATOMIC_3_imp; } @@ -2469,10 +2491,12 @@ foreach size = ["i16", "i32", "i64"] in { // ".cas", atomic_cmp_swap_i32_acquire_global, i32imm, // [hasSM<70>, hasPTX<63>]> multiclass INT_PTX_ATOM_CAS preds> + string order, string scope, string addrspace, + list preds> : F_ATOMIC_3("i"#type), !cast("Int"#type#"Regs"), order, + scope, addrspace, ".b"#type, ".cas", @@ -2487,26 +2511,35 @@ foreach size = ["32", "64"] in { defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + defvar atomic_cmp_swap_pat = 
!cast("atomic_cmp_swap_i"#size#_#order#_#addrspace); + defm atomic_cmp_swap_i#size#_#order#_#addrspace: nvvm_ternary_atomic_op_scoped; + + foreach scope = ["cta", "cluster", "gpu", "sys"] in { + defm INT_PTX_ATOM_CAS_#size#_#order#addrspace#scope + : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace#_#scope, size, + cas_order_string, "."#scope, cas_addrspace_string, + [hasSM<70>, hasPTX<63>]>; + } // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. defm INT_PTX_ATOM_CAS_#size#_#order#addrspace : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - cas_order_string, cas_addrspace_string, + cas_order_string, "", cas_addrspace_string, [hasSM<70>, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - "", cas_addrspace_string, []>; + "", "", cas_addrspace_string, []>; } } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; // Support for scoped atomic operations. Matches diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 91df5f467e59c..53a3fcc1008b7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12430,7 +12430,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -12440,7 +12441,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 1f22aa16a89be..8e02c0dbc0fca 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -921,10 +921,14 @@ namespace llvm { return true; } - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool shouldInlineQuadwordAtomics() const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp 
b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4e6b3a224b79b..2c4000e837f09 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22094,7 +22094,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); @@ -22110,7 +22111,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 26b888653c81d..0f3b50779b30b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -708,10 +708,14 @@ class RISCVTargetLowering : public TargetLowering { // than this hook due to limitations in the interface here. bool shouldInsertFencesForAtomic(const Instruction *I) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index 9027bd6a14780..98afc792b3b0b 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.u32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.u32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], 
%r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.u32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.u32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index ea308c2a7673b..3c00f9585254f 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; 
SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: 
$L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_sys( ; SM60: { ; 
SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; 
SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -596,12 +587,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic 
acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -842,8 +830,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -901,15 +890,15 @@ 
define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -980,8 +971,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1014,12 +1005,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 
%r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1121,8 +1112,8 @@ define i8 
@release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1155,12 +1146,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; 
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1275,7 +1265,7 @@ define i8 
@acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; 
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB30_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 
%new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB32_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB33_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1578,12 +1562,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } 
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB34_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB34_1; ; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB35_1; ; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: 
and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB37_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB38_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB39_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: 
shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB41_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB42_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB43_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2045,15 +2019,15 @@ define i8 
@seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 
%r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB45_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB45_1; ; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; 
SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB46_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB46_1; ; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB47_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB47_1; ; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB48_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB48_1; ; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, 
i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB49_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB50_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, 
%r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB51_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; 
-; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB52_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, 
%r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB53_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB54_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 
bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB55_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr 
%addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB56_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; 
SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB57_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: 
not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB58_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB59_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB61_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 
@acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB62_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: 
ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB63_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; 
+; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB66_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB67_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, 
i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB68_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 
%r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB69_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB69_1; ; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB70_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB70_1; ; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB71_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB71_1; ; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; 
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB72_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB73_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, 
%r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB74_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; 
SM60-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB75_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, 
[acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB76_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB77_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; 
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB78_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra 
$L__BB79_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB80_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB81_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; 
SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB82_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB83_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: 
and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB84_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB84_1; ; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; 
SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB85_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB86_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB87_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 
@seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB88_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB89_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB90_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB90_1; +; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB91_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB91_1; +; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, 
%r16, %r2; +; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB92_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB92_1; +; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB93_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB93_1; +; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 
%r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB94_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB94_1; +; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB95_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB95_1; +; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, 
%r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB96_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB96_1; +; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB97_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB97_1; +; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB98_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB98_1; +; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB99_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB99_1; +; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: 
ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB100_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB100_1; +; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB101_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB101_1; +; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; 
+; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB102_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB102_1; +; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB103_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB103_1; +; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; 
SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB104_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB104_1; +; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB105_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB105_1; +; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB106_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB106_1; +; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB107_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: 
@%p2 bra $L__BB107_1; +; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB108_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB108_1; +; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB109_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB109_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB109_1; +; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB110_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB110_1; +; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, 
%r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB111_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB111_1; +; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB112_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB112_1; +; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB113_1: 
// %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB113_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB113_1; +; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB114_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB114_1; +; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB115_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB115_1; +; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB116_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB116_1; +; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; 
SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB117_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB117_1; +; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB118_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB118_1; +; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB119_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB119_1; +; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB120_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB120_1; +; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; 
SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB121_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB121_1; +; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB122_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB122_1; +; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 
%rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB123_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB123_1; +; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB124_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB124_1; +; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 
@seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB125_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB125_1; +; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB126_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB126_1; +; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 
[func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB127_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB127_1; +; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB128_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB128_1; +; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB129_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB129_1; +; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB130_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: 
// in Loop: Header=BB130_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB130_1; +; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB131_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB131_1; +; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB132_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB132_1; +; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB133_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB133_1; +; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; 
SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB134_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB134_1; +; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB135_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB135_1; +; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, 
[%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB136_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB136_1; +; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB137_1; +; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; 
+; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, 
%r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 
3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 
3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, 
[monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; 
SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg 
.b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
monotonic_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: 
and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: 
// in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB165_3; +; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: 
$L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; 
+; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: 
ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; 
+; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, 
%r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; 
SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; 
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; 
SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: 
.reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_gpu( +; SM60: { +; 
SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB192_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, 
i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB198_1; +; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB207_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; 
SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, 
%r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; 
SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg 
.b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM60-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB227_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB227_1; +; 
SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: and.b32 %r8, 
%r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB230_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra 
$L__BB231_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; 
+; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 
%r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 
65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, 
-4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: 
ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, 
[seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; 
SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 
@seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, 
%r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // 
in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 
%r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic 
monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; 
SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, 
[monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: 
.reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_sys( +; 
SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) 
%addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acquire_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; 
SM60-LABEL: acquire_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 
@acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], 
%r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[release_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: 
ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: 
ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; 
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, 
[acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM60: { +; 
SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 
%new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 
[func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[seq_cst_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 
%r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: 
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, 
[monotonic_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 
%rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, 
[acquire_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; 
+; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + 
ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( +define i64 
@acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared( +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, 
%r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; 
SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared( +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 
@acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared( +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, 
[release_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; 
-; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: 
.reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 
@release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic( +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, 
[acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
release_seq_cst_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new 
acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 
@seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold 
= cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 
%cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 
@monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( +define 
i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, 
i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM60: { ; 
SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 
%rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: 
atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: 
membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 4360ea36e863a..d8f961be05ab0 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr 
%addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: 
// %partword.cmpxchg.failure @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; 
SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -445,7 +436,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, 
%r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -596,12 +587,12 @@ define i8 
@acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -629,7 +620,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; 
SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -816,7 +804,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -842,9 +830,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -863,7 +851,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: 
@%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -901,15 +890,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -980,9 +971,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1014,12 +1005,12 @@ define i8 
@release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1048,7 +1039,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; 
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1121,8 +1112,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1155,12 +1146,12 @@ define 
i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1236,7 +1227,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1275,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1283,7 +1273,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure 
@@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB30_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr 
%addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1471,7 +1457,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB31_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB32_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB33_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1578,12 +1562,12 @@ define i8 
@acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB34_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB34_1; ; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB35_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB35_1; ; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB36_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1719,12 +1700,12 @@ define i8 
@seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB37_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 
%rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB38_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB39_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 
[func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB40_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, 
%r9, 3; @@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB41_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB42_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg 
ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB43_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB44_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: 
or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB45_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB45_1; ; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB46_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB46_1; ; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB47_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB47_1; ; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
acquire_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB48_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB48_1; ; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: 
fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB49_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; 
SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB50_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB51_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra 
$L__BB52_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB53_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; 
+; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB54_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_cta( ; SM70: { ; 
SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB55_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB56_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: 
not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB57_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB58_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB59_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB61_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB61_1; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; 
+; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB62_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB63_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; 
SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB66_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, 
%r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB67_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") release acquire + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB68_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 
%rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB69_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB69_1; ; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB70_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB70_1; ; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB71_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB71_1; ; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB72_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB73_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; 
SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB74_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel 
monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB75_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg 
.b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB76_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; 
; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB77_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB78_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: 
or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB79_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB80_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: 
Header=BB80_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB81_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB82_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg 
.b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB83_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB84_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB84_1; ; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: 
shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB85_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB86_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB87_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB88_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], 
%r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB89_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB89_1; +; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB90_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB90_1; +; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB91_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB91_1; +; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 
%p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB92_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB92_1; +; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB93_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB93_1; +; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB94_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB94_1; +; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB95_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB95_1; +; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: 
ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB96_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB96_1; +; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB97_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB97_1; +; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; 
SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB98_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB98_1; +; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB99_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB99_1; +; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; 
SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB100_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB100_1; +; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB101_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB101_1; +; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; 
SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB102_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB102_1; +; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB103_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB103_1; +; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB104_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB104_1; +; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB105_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB105_1; +; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB106_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB106_1; +; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB107_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB107_1; +; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB108_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB108_1; +; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], 
%r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB109_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB109_1; +; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB110_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB110_1; +; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB111_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB111_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB111_1; +; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB112_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB112_1; +; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB113_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB113_1; +; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB114_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB114_1; +; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: 
and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB115_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB115_1; +; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB116_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB116_1; +; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 
%rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB117_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB117_1; +; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB118_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB118_1; +; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; 
SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB119_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB119_1; +; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB120_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB120_1; +; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst 
acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB121_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB121_1; +; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB122_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB122_1; +; 
SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB123_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB123_1; +; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB124_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB124_1; +; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB125_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB125_1; +; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, 
%r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB126_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB126_1; +; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB127_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB127_1; +; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 
%r20, %r16, %r2; +; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB128_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB128_1; +; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB129_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB129_1; +; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB130_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB130_1; +; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB131_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB131_1; +; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 
%r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB132_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB132_1; +; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB133_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB133_1; +; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB134_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB134_1; +; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB135_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB135_1; +; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; 
SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB136_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB136_1; +; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB137_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB137_1; +; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: 
.reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB138_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB138_1; +; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB139_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB139_1; +; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB140_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB140_1; +; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB141_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB141_1; +; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") 
monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB142_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB142_1; +; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB143_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB143_1; +; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], 
%r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB144_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB144_1; +; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB145_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB145_1; +; SM70-NEXT: $L__BB145_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB146_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB146_1; +; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB147_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB147_1; +; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB148_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB148_1; +; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB149_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB149_1; +; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB150_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB150_1; +; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB151_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB151_1; +; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB152_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB152_1; +; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB153_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB153_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB153_1; +; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB154_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB154_1; +; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB155_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB155_1; +; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB156_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB156_1; +; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; 
SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB157_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB157_1; +; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB158_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB158_1; +; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB159_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB159_1; +; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB160_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB160_1; +; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // 
%bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB161_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB161_1; +; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB162_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB162_1; +; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; 
+; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB163_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB163_1; +; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB164_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB164_1; +; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { 
+; SM70-LABEL: acquire_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB165_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB165_1; +; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB166_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB166_1; +; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB167_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB167_1; +; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB168_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB168_1; +; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB169_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB169_1; +; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB170_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB170_1; +; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB171_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB171_1; +; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB172_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 
+; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB172_1; +; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB173_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB173_1; +; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB174_3; +; SM70-NEXT: // %bb.2: 
// %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB174_1; +; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB175_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB175_1; +; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, 
[%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB176_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB176_1; +; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB177_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB177_1; +; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB178_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB178_1; +; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB179_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB179_1; +; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; 
SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB180_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB180_1; +; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB181_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB181_1; +; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB182_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB182_1; +; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB183_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB183_1; +; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 
3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB184_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB184_1; +; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB185_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB185_1; +; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, 
[acquire_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB186_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB186_1; +; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB187_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB187_1; +; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[acquire_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB188_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB188_1; +; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB189_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB189_1; +; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: 
.reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB190_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB190_1; +; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB191_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB191_1; +; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
release_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB192_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB192_1; +; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB193_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB193_1; +; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") 
release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB194_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB194_1; +; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB195_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB195_1; +; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end 
+; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB196_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB196_1; +; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB197_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB197_1; +; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB198_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB198_1; +; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB199_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure 
+; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB199_1; +; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB200_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB200_1; +; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, 
[%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB201_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB201_1; +; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB202_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB202_1; +; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB203_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB203_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB203_1; +; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB204_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB204_1; +; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB205_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB205_1; +; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB206_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB206_1; +; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, 
%r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB207_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB207_1; +; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB208_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB208_1; +; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, 
[release_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB209_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB209_1; +; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB210_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB210_1; +; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB211_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB211_1; +; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB212_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB212_1; +; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: 
.reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB213_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB213_1; +; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB214_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB214_1; +; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret 
i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB215_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB215_1; +; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB216_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB216_1; +; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB217_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB217_1; +; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB218_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB218_1; +; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB219_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB219_1; +; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: 
@%p1 bra $L__BB220_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB220_1; +; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB221_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB221_1; +; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB222_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB222_1; +; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB223_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB223_1; +; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, 
%r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB224_1; +; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB225_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB225_1; +; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 
65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB226_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB226_1; +; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB227_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB227_1; +; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB228_1; +; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB229_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB229_1; +; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB230_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB230_1; +; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB231_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB231_1; +; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 
%r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB232_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB232_1; +; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB233_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB233_1; +; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 
@acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB234_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB234_1; +; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB235_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB235_1; +; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB236_1; +; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB237_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB237_1; +; SM70-NEXT: $L__BB237_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB238_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB238_1; +; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB239_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 
+; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB239_1; +; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB240_1; +; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB241_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB241_1; +; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB242_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB242_1; +; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB243_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB243_1; +; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB244_1; +; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 
%r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB245_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB245_1; +; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB246_1; +; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, 
%r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB247_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB247_1; +; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB248_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB248_1; +; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB249_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB249_1; +; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB250_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB250_1; +; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, 
[seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB251_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB251_1; +; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB252_1; +; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB253_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB253_1; +; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB254_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB254_1; +; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB255_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB255_1; +; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB256_1; +; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB257_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB257_1; +; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB258_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB258_1; +; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB259_1; +; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB260_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB260_1; +; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB261_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB261_1; +; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB262_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB262_1; +; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB263_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB263_1; +; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, 
%r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB264_1; +; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB265_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB265_1; +; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.u32 %r15, 
[%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB266_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB266_1; +; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB267_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB267_1; +; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB268_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB268_1; +; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB269_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB269_1; +; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: 
ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; 
SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: 
ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // 
%bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
monotonic_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[acquire_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; 
SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 
%cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: 
atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, 
[release_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 
%r1, [release_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[release_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
release_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel 
monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], 
%r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; 
SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, 
[acq_rel_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg 
.b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret 
i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 
%cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, 
i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; 
SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; 
SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[monotonic_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[monotonic_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; 
SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB89_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB89_1; -; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic( +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, 
[monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global( +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared( +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) 
{ -; SM70-LABEL: monotonic_acquire_i32_generic( +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global( +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared( +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic( +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global( +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: 
atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; 
SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic( +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global( +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared( +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic( +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global( +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, 
[acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared( +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 
@acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global( +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared( +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, 
[release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic( +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared( +define i64 @release_acquire_i64_global_gpu(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic( +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global( +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared( +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: 
.reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = 
cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg 
.b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst 
acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 
[func_retval0], %r2; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; 
SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[acquire_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; 
SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; 
SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM70: { ; 
SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared( +define i64 
@seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { 
-; SM70-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) 
%addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire 
ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = 
cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg 
ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 5acb275a6f581..8eae5bfb0a133 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 
%rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -166,7 +166,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { 
; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -352,7 +346,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +446,14 
@@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: 
ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -537,7 +526,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -575,7 +563,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -583,7 +571,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -596,12 +584,12 @@ define i8 
@acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -609,8 +597,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -621,15 +609,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +627,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -655,8 +643,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM90-NEXT: and.b32 %r10, %r9, 3; @@ -667,15 +655,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -701,9 +689,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -714,7 +701,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -722,7 +709,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -748,9 +735,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -761,7 +747,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -769,7 +755,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -782,12 +768,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -795,9 +781,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -808,15 +793,15 @@ define i8 
@acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -826,15 +811,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -842,9 +827,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -855,15 +839,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -888,9 +873,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -901,7 +885,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -909,7 +893,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -934,9 +919,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -947,7 +931,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -955,7 +939,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -980,9 +965,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -993,15 +977,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1011,15 +995,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 
%new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1027,9 +1011,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1040,15 +1023,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1074,9 +1057,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; -; 
SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1087,7 +1069,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1095,7 +1077,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1121,8 +1103,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,7 +1116,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1142,7 +1124,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure @@ -1155,12 +1137,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1168,9 +1150,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1181,15 +1163,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1199,15 +1181,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1215,9 +1197,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, 
[monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1228,15 +1210,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1262,9 +1244,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1275,7 +1257,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1283,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; 
SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1309,9 +1291,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1322,7 +1304,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1330,7 +1312,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1343,12 +1325,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1356,9 +1338,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1369,15 +1351,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1387,15 +1369,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1403,9 +1385,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1416,15 +1398,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, 
[%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB30_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB30_1; ; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1450,9 +1432,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1463,7 +1445,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1471,7 +1453,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB31_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; 
@@ -1497,9 +1479,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1510,7 +1492,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1518,7 +1500,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB32_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1531,12 +1513,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1544,9 +1526,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1557,15 +1539,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, 
%r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB33_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1575,15 +1557,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1591,9 +1573,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1604,15 +1586,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB34_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
monotonic_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1638,9 +1620,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB35_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: 
ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB36_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB37_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB37_1; ; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cluster( ; 
SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB38_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB38_1; ; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1839,7 +1817,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1847,7 +1825,7 
@@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB39_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB39_1; ; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB40_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1907,12 +1884,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: 
.reg .b16 %rs<2>; @@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB41_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB42_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2027,7 +2001,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2035,7 +2009,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB43_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( +define i8 
@acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB44_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2095,3586 +2068,20641 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB45_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB46_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB47_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB48_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 
@monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB49_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 
%rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB50_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 
%r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB51_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, 
[acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB52_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; 
SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB53_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( +define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB54_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB55_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( 
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB56_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB56_1; ; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB57_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB57_1; ; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( +define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; 
SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB58_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB58_1; ; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, 
%r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB59_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB59_1; ; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB61_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) 
%addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB62_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acquire_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB63_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB64_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 
%r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, 
%r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB67_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB68_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB68_1; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: 
@%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB69_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; 
SM90-LABEL: release_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB70_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB71_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; 
SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB72_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB73_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: 
atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB74_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB75_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: 
Header=BB75_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB76_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB77_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; 
SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB78_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; 
SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB79_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 
%r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB80_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: 
// %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB81_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global( +define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra 
$L__BB82_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB83_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB84_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 
%rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB85_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared( +define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB86_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: 
mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB87_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; 
SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB88_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB89_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB89_1; +; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: 
fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB90_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB90_1; +; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB91_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB91_1; +; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: 
.reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB92_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB92_1; +; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB93_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB93_1; +; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new 
+} + +define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB94_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB94_1; +; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB95_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB95_1; 
+; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB96_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB96_1; +; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB97_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB97_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB97_1; +; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB98_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB98_1; +; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB99_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB99_1; +; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB100_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB100_1; +; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, 
%r16, %r2; +; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB101_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB101_1; +; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB102_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB102_1; +; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB103_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB103_1; +; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB104_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB104_1; +; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB105_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB105_1; +; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB106_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB106_1; +; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB107_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB107_1; +; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB108_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB108_1; +; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, 
i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB109_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB109_1; +; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB110_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB110_1; +; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB111_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB111_1; +; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB112_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM90-NEXT: 
and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB112_1; +; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB113_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB113_1; +; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 
%r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB114_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB114_1; +; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB115_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB115_1; +; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB116_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB116_1; +; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB117_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB117_1; +; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: 
mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB118_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB118_1; +; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB119_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB119_1; +; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB120_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB120_1; +; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB121_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB121_1; +; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; 
SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB122_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB122_1; +; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB123_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB123_1; +; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + 
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB124_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB124_1; +; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB125_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB125_1; +; SM90-NEXT: $L__BB125_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB126_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB126_1; +; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB127_3; +; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB127_1; +; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB128_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB128_1; +; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB129_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB129_1; +; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB130_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB130_1; +; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: 
ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB131_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB131_1; +; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB132_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB132_1; +; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: 
shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB133_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB133_1; +; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB134_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB134_1; +; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; 
+; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB135_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB135_1; +; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB136_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB136_1; +; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: 
+; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB137_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB137_1; +; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB138_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB138_1; +; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 
@acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB139_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB139_1; +; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB140_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB140_1; +; SM90-NEXT: $L__BB140_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB141_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB141_1; +; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB142_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB142_1; +; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB143_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB143_1; +; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 
%r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB144_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB144_1; +; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB145_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB145_1; +; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, 
%r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB146_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB146_1; +; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB147_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB147_1; +; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB148_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB148_1; +; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB149_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB149_1; +; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: 
fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB150_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB150_1; +; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB151_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB151_1; +; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 
%rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB152_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB152_1; +; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB153_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB153_1; +; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB154_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB154_1; +; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB155_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, 
%r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB155_1; +; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB156_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB156_1; +; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB157_3; +; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB157_1; +; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB158_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB158_1; +; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, 
%r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB159_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB159_1; +; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB160_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB160_1; +; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB161_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB161_1; +; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB162_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB162_1; +; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, 
%r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB163_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB163_1; +; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB164_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB164_1; +; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: 
fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB165_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB165_1; +; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB166_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB166_1; +; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; 
+; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB167_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB167_1; +; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB168_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB168_1; +; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 
@seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB169_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB169_1; +; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB170_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB170_1; +; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB171_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB171_1; +; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB172_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM90-NEXT: and.b32 
%r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB172_1; +; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB173_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB173_1; +; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB174_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB174_1; +; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB175_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB175_1; +; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, 
[%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB176_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB176_1; +; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB177_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB177_1; +; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB178_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB178_1; +; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB179_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB179_1; +; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB180_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB180_1; +; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB181_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB181_1; +; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB182_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB182_1; +; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB183_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB183_1; +; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB184_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB184_1; +; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB185_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB185_1; +; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 
%r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB186_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB186_1; +; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB187_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB187_1; +; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
monotonic_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB188_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB188_1; +; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB189_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB189_1; +; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + 
+define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB190_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB190_1; +; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB191_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB191_1; +; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB192_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB192_1; +; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB193_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB193_1; +; SM90-NEXT: $L__BB193_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB194_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB194_1; +; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB195_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB195_1; +; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB196_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB196_1; +; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB197_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB197_1; +; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB198_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB198_1; +; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, 
%r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB199_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB199_1; +; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB200_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB200_1; +; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB201_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB201_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB201_1; +; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB202_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB202_1; +; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB203_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB203_1; +; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB204_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB204_1; +; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 
%r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB205_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB205_1; +; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB206_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB206_1; +; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, 
[monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB207_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB207_1; +; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB208_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB208_1; +; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB209_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB209_1; +; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB210_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB210_1; +; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
monotonic_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB211_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB211_1; +; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB212_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB212_1; +; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB213_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB213_1; +; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB214_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: 
mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB214_1; +; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB215_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB215_1; +; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB216_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB216_1; +; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB217_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB217_1; +; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB218_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB218_1; +; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB219_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB219_1; +; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; 
SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB220_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB220_1; +; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB221_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB221_1; +; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: 
and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB222_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB222_1; +; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB223_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB223_1; +; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB224_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB224_1; +; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB225_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB225_1; +; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: 
and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB226_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB226_1; +; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB227_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB227_1; +; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 
%rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB228_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB228_1; +; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB229_1; +; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB230_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB230_1; +; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB231_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB231_1; +; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB232_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB232_1; +; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB233_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB233_1; +; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg 
.b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB234_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB234_1; +; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB235_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB235_1; +; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB236_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB236_1; +; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB237_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB237_1; +; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, 
i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB238_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB238_1; +; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB239_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB239_1; +; SM90-NEXT: $L__BB239_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB240_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB240_1; +; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB241_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB241_1; +; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB242_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB242_1; +; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB243_3; +; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB243_1; +; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB244_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB244_1; +; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 
%r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB245_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB245_1; +; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB246_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB246_1; +; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 
%r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB247_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB247_1; +; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB248_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB248_1; +; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, 
%r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB249_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB249_1; +; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB250_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB250_1; +; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB251_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB251_1; +; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB252_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB252_1; +; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB253_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB253_1; +; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB254_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB254_1; +; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: 
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB255_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB255_1; +; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB256_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB256_1; +; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: 
.reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB257_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB257_1; +; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB258_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB258_1; +; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 
@release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB259_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB259_1; +; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB260_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB260_1; +; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB261_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB261_1; +; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB262_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB262_1; +; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB263_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB263_1; +; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB264_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB264_1; +; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB265_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB265_1; +; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB266_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB266_1; +; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB267_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB267_1; +; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB268_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB268_1; +; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB269_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB269_1; +; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, 
%r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB270_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB270_1; +; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB271_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB271_1; +; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; 
SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB272_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB272_1; +; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB273_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB273_1; +; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: 
fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB274_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB274_1; +; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB275_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB275_1; +; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB276_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB276_1; +; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB277_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB277_1; +; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: 
.reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB278_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB278_1; +; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB279_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB279_1; +; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 
@release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB280_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB280_1; +; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB281_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB281_1; +; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB282_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB282_1; +; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB283_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, 
%r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB283_1; +; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB284_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB284_1; +; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, 
%r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB285_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB285_1; +; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB286_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB286_1; +; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM90-NEXT: 
// =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB287_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB287_1; +; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB288_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB288_1; +; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB289_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB289_1; +; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB290_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB290_1; +; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: 
mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB291_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB291_1; +; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB292_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB292_1; +; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB293_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB293_1; +; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB294_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB294_1; +; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB295_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB295_1; +; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB296_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB296_1; +; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, 
i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB297_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB297_1; +; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB298_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB298_1; +; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB299_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB299_1; +; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB300_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB300_1; +; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB301_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB301_1; +; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra 
$L__BB302_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB302_1; +; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB303_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB303_1; +; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 
%r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB304_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB304_1; +; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB305_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB305_1; +; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, 
[%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB306_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB306_1; +; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB307_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB307_1; +; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB308_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB308_1; +; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB309_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB309_1; +; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB310_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB310_1; +; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB311_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB311_1; +; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB312_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB312_1; +; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB313_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB313_1; +; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; 
SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB314_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB314_1; +; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB315_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB315_1; +; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB316_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB316_1; +; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB317_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB317_1; +; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB318_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB318_1; +; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB319_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB319_1; +; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB320_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB320_1; +; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB321_3; +; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB321_1; +; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB322_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB322_1; +; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB323_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB323_1; +; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB324_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB324_1; +; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, 
[%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB325_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB325_1; +; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB326_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB326_1; +; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB327_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB327_1; +; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB328_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB328_1; +; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB329_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB329_1; +; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB330_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB330_1; +; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB331_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB331_1; +; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB332_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB332_1; +; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM90: 
{ +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB333_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB333_1; +; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB334_1; +; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB335_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB335_1; +; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB336_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB336_1; 
+; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB337_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB337_1; +; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB338_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM90-NEXT: 
and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB338_1; +; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB339_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB339_1; +; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM90-NEXT: @%p1 bra $L__BB340_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB340_1; +; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB341_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB341_1; +; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB342_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB342_1; +; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB343_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB343_1; +; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 
%r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB344_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB344_1; +; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB345_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB345_1; +; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; 
SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB346_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB346_1; +; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB347_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB347_1; +; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, 
[seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB348_1; +; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB349_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB349_1; +; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB350_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB350_1; +; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB351_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB351_1; +; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: 
.reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB352_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB352_1; +; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB353_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB353_1; +; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB354_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB354_1; +; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB355_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB355_1; +; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB356_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB356_1; +; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB357_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, 
%r8; +; SM90-NEXT: @%p2 bra $L__BB357_1; +; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB358_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB358_1; +; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra 
$L__BB359_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB359_1; +; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; 
SM90-LABEL: monotonic_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; 
SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_gpu( +; SM90: { 
+; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst 
+ ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; 
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; 
SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { +; SM90-LABEL: acquire_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: 
atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, 
[acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg 
.b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, 
i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: 
ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: 
// %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
release_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 
%new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: 
.reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic 
+ ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel 
acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 
%new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 
%new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
seq_cst_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: 
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; 
SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 
%rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cluster(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: 
atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB89_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB89_1; -; SM90-NEXT: $L__BB89_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic( +define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global( +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared( +define i64 
@release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic( +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global( +define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared( +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[monotonic_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared( +define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], 
%rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg 
.b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret 
i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[acquire_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 
%new) { -; SM90-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: 
atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; -; SM90-NEXT: 
atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 
%new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], 
%rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, 
[%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel 
acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; 
SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("block") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; 
SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cluster(ptr 
addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; 
SM90-LABEL: release_acquire_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, 
i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel 
acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 
%rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: 
ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index aaea0d2ee25ef..7767cec9c4fcb 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT: @%p1 bra $L__BB0_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r17, %r20, %r3;
 ; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT: @%p1 bra $L__BB1_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r17, %r20, %r3;
 ; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT: @%p1 bra $L__BB2_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r17, %r20, %r3;
 ; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT: @%p1 bra $L__BB3_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r17, %r20, %r3;
 ; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT: @%p1 bra $L__BB4_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r16, %r19, %r3;
 ; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB5_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r16, %r19, %r3;
 ; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB6_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r16, %r19, %r3;
 ; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB7_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r16, %r19, %r3;
 ; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB8_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT: or.b32 %r16, %r19, %r3;
 ; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB9_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
 ; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
 ; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
-; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: relaxed_sys_i32(
@@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
 ; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
 ; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: acq_rel_sys_i32(
@@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
 ; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
 ; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: acquire_sys_i32(
@@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
 ; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
 ; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
-; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: release_sys_i32(
@@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT: fence.sc.sys;
 ; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
 ; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: seq_cst_sys_i32(
@@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
 ; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
 ; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
-; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: relaxed_sys_i64(
@@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
 ; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
 ; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: acquire_sys_i64(
@@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
 ; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
 ; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: acq_rel_sys_i64(
@@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
 ; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
 ; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
-; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: release_sys_i64(
@@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT: fence.sc.sys;
 ; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
 ; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
 ; SM90-LABEL: seq_cst_sys_i64(
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index ae7450015ecd2..277704bd9d5a5 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,8 +5,8 @@ from itertools import product
 cmpxchg_func = Template(
-    """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
-  %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+    """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+  %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
   ret i$size %new
 }
 """
 )
@@ -38,9 +38,12 @@ for sm, ptx in TESTS:
     with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
         print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
 
-        for size, success, failure, addrspace in product(
-            SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+        for size, success, failure, addrspace, llvm_scope in product(
+            SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES
         ):
+            # cluster ordering is supported from SM90 onwards
+            if sm != 90 and llvm_scope == "cluster":
+                continue
             if addrspace == 0:
                 addrspace_cast = ""
             else:
@@ -52,6 +55,8 @@
                 size=size,
                 addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                 addrspace_cast=addrspace_cast,
+                llvm_scope=llvm_scope,
+                ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
             ),
             file=fp,
         )
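
For reference, one test function instantiated from the updated cmpxchg_func template would look roughly like the sketch below. The concrete values are illustrative assumptions only: it presumes the generator's ADDRSPACE_NUM_TO_ADDRSPACE table maps addrspace 1 to "global", that addrspace_cast expands to " addrspace(1)" in that case, and that SCOPE_LLVM_TO_PTX maps the LLVM syncscope "device" to the PTX scope "gpu"; those tables are defined elsewhere in cmpxchg.py and are not shown in this hunk.

define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
  ; illustrative instantiation: size=32, success=acquire, failure=monotonic, llvm_scope="device" (assumed scope name)
  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
  ret i32 %new
}

With the scope now threaded through emitLeadingFence/emitTrailingFence and preserved on the CAS itself, a scoped cmpxchg like this should be able to lower to a .gpu-scoped atomic rather than falling back to the conservative .sys variants checked above.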